# Print the penguins tibble (344 rows x 7 columns, per the output below)
penguins
## # A tibble: 344 × 7
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## 7 Adelie Torgersen 38.9 17.8 181 3625
## 8 Adelie Torgersen 39.2 19.6 195 4675
## 9 Adelie Torgersen 34.1 18.1 193 3475
## 10 Adelie Torgersen 42 20.2 190 4250
## # ℹ 334 more rows
## # ℹ 1 more variable: sex <fct>
# Transposed preview: one line per column with its type and first values
glimpse(penguins)
## Rows: 344
## Columns: 7
## $ species <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
## $ island <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
## $ bill_length_mm <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
## $ bill_depth_mm <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
## $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
## $ body_mass_g <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
## $ sex <fct> male, female, female, NA, female, male, female, male…
# Open the help page; both modeldata and palmerpenguins define `penguins`,
# so R reports the conflict and uses the first match
?penguins
## Help on topic 'penguins' was found in the following packages:
##
## Package Library
## modeldata /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
## palmerpenguins /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
##
##
## Using the first match ...
# Scatterplot of body mass vs. flipper length. The mapping in ggplot() is
# global (inherited by all geoms); color/shape are mapped only inside
# geom_point(), so geom_smooth() fits a single line across all species.
penguins |>
  ggplot(aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = species, shape = species)) + # per-geom aesthetics
  geom_smooth(method = "lm") + # one linear trend for the whole data
  labs(
    title = "Body mass and flipper length",
    subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins",
    x = "Flipper length (mm)",
    y = "Body mass (g)",
    color = "Species",
    shape = "Species",
    caption = "Data come from the palmerpenguins package"
  ) +
  scale_color_colorblind() # colorblind-safe palette (ggthemes)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
dim(penguins) # 344 rows, 7 columns (output below confirms: 344 7)
## [1] 344 7
?penguins # per the help page, bill_depth_mm is "a number denoting bill depth (millimeters)", i.e. bill thickness
## Help on topic 'penguins' was found in the following packages:
##
## Package Library
## modeldata /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
## palmerpenguins /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
##
##
## Using the first match ...
# Scatterplot of bill depth vs. bill length
ggplot(
  data = penguins,
  mapping = aes(x = bill_length_mm, y = bill_depth_mm)
) +
  geom_point() + # relationship appears positive
  labs(
    title = "Bill length and bill depth",
    x = "Bill length (mm)",
    y = "Bill depth (mm)"
  )
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Strip-style scatter: numerical bill length on x, categorical species on y.
penguins |>
  ggplot(aes(x = bill_length_mm, y = species)) +
  geom_point() +
  labs(
    title = "Bill length and species",
    x = "Bill length (mm)",
    y = "Species"
  )
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Same variables as above, but as a boxplot per species
ggplot(
  data = penguins,
  mapping = aes(x = bill_length_mm, y = species)
) +
  geom_boxplot() +
  labs(
    title = "Bill length and species",
    x = "Bill length (mm)",
    y = "Species"
  )
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Same boxplot, silencing the missing-value warning.
# Fixed: use TRUE rather than T — T is an ordinary variable that can be
# reassigned, so code relying on it is fragile.
ggplot(
  data = penguins,
  mapping = aes(x = bill_length_mm, y = species)
) +
  geom_boxplot(na.rm = TRUE) + # drop missing values without a warning
  labs(
    title = "Bill length and species",
    x = "Bill length (mm)",
    y = "Species"
  )
colSums(is.na(penguins)) # count missing values per column (sex has the most: 11)
## species island bill_length_mm bill_depth_mm
## 0 0 2 2
## flipper_length_mm body_mass_g sex
## 2 2 11
# Color points by a third numeric variable; smooth still fits all points
ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point(mapping = aes(color = bill_depth_mm)) + # continuous color scale
  geom_smooth() # defaults to loess (see message below)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# color mapped globally, so both the points and the smooths split by island
ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g, color = island)
) +
  geom_point() +
  geom_smooth(se = FALSE) # hide confidence bands
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Baseline version: one smooth over all points, no grouping aesthetic
ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Equivalent plot with data/mapping supplied per geom instead of globally
ggplot() +
  geom_point(
    data = penguins,
    mapping = aes(x = flipper_length_mm, y = body_mass_g)
  ) +
  geom_smooth(
    data = penguins,
    mapping = aes(x = flipper_length_mm, y = body_mass_g)
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Chapter 1 ----
# verbose form: argument names spelled out
ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# concise form: data and mapping passed positionally
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# native pipe (|>): data is piped into ggplot()'s first argument
penguins |>
  ggplot(aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Bar chart of a categorical variable
ggplot(penguins, aes(x = species)) +
  geom_bar()
# reorder bars by frequency by converting to a factor ordered by count
ggplot(penguins, aes(x = fct_infreq(species))) +
  geom_bar()
# 1.4.2 A numerical variable
ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 200) # bin width in grams (x-axis units)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
# play with bin width: too narrow -> noisy
ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 20)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
# too wide -> over-smoothed, hides the distribution's shape
ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 2000)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
# density plot: smoothed alternative to the histogram
ggplot(penguins, aes(x = body_mass_g)) +
  geom_density()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).
ggplot(penguins, aes(y = species)) + # bars are horizontal when mapped to y
  geom_bar()
ggplot(penguins, aes(x = species)) +
  geom_bar(color = "red") # color sets the bar outline
ggplot(penguins, aes(x = species)) +
  geom_bar(fill = "red") # fill sets the bar interior
# binwidth is the width of each bar in x-axis units (carats here)
ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.05)
ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.50)
# boxplot: one numerical and one categorical variable
ggplot(penguins, aes(x = species, y = body_mass_g)) +
  geom_boxplot()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# density plot per species; linewidth thickens the curves so they stand
# out against the background
ggplot(penguins, aes(x = body_mass_g, color = species)) +
  geom_density(linewidth = 0.75)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).
# alpha adds transparency so the filled density curves remain readable
# where they overlap
ggplot(penguins, aes(x = body_mass_g, color = species, fill = species)) +
  geom_density(alpha = 0.5)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).
# 1.5.2 Two categorical variables
# stacked counts are difficult to interpret: the islands have different totals
ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar()
# relative frequency plot via position = "fill": each bar sums to 1, so
# species composition can be compared across islands despite unequal
# penguin counts per island
ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar(position = "fill")
# 1.5.3 Two numerical variables
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# 3 or more variables on one plot, but cluttered and difficult to read
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = species, shape = island))
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# solution 1: facets!
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = species, shape = species)) +
  facet_wrap(~island) # separate panel per island
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Print the mpg tibble (234 rows x 11 columns, per the output below)
mpg
## # A tibble: 234 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto… f 18 29 p comp…
## 2 audi a4 1.8 1999 4 manu… f 21 29 p comp…
## 3 audi a4 2 2008 4 manu… f 20 31 p comp…
## 4 audi a4 2 2008 4 auto… f 21 30 p comp…
## 5 audi a4 2.8 1999 6 auto… f 16 26 p comp…
## 6 audi a4 2.8 1999 6 manu… f 18 26 p comp…
## 7 audi a4 3.1 2008 6 auto… f 18 27 p comp…
## 8 audi a4 quattro 1.8 1999 4 manu… 4 18 26 p comp…
## 9 audi a4 quattro 1.8 1999 4 auto… 4 16 25 p comp…
## 10 audi a4 quattro 2 2008 4 manu… 4 20 28 p comp…
## # ℹ 224 more rows
# Open the help page for the mpg dataset
?mpg
# Map a third numerical variable (city mpg) to point size
mpg |>
  ggplot(aes(x = hwy, y = displ, size = cty)) +
  geom_point()
# Map cty to both size and color
ggplot(
  mpg,
  aes(x = hwy, y = displ, size = cty, color = cty)
) +
  geom_point()
# Add a categorical variable (drv) mapped to shape
ggplot(
  mpg,
  aes(x = hwy, y = displ, size = cty, color = cty, shape = drv)
) +
  geom_point()
# linewidth has no meaning for points, so geom_point() ignores it
ggplot(
  mpg,
  aes(x = hwy, y = displ, size = cty, color = cty, shape = drv, linewidth = cty) # linewidth is ignored
) +
  geom_point()
# Facet the bill scatterplot by species
ggplot(penguins, aes(x = bill_depth_mm, y = bill_length_mm)) +
  geom_point(aes(color = species)) +
  facet_wrap(~species)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Relabel only the color legend; shape keeps its default title, so the
# two legends no longer merge into one
ggplot(
  data = penguins,
  mapping = aes(
    x = bill_length_mm, y = bill_depth_mm,
    color = species, shape = species
  )
) +
  geom_point() +
  labs(color = "species") # lower case to match the variable name
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Swapping x and fill changes the question the plot answers
ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar(position = "fill") # species mix within each island
ggplot(penguins, aes(x = species, fill = island)) +
  geom_bar(position = "fill") # island mix within each species
# Draw the plot that the following ggsave() call will write to disk
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# ggsave() with no plot argument saves the most recently displayed plot
ggsave(filename = "plots/penguin-plot.svg")
## Saving 8 x 5 in image
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Two mpg plots; only the second (the scatterplot) is the "last plot"
ggplot(mpg, aes(x = class)) +
  geom_bar()
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point()
ggsave("plots/mpg-plot.pdf") # saves the last plot displayed; remember F1 for help
## Saving 8 x 5 in image
my_variable <- 10
my_variable # NOTE(review): comment said "typo!" — in the book's version this line misspells the name (my_varıable) to demo the error; here it is spelled correctly, so it simply prints 10
## [1] 10
# Scatterplot with a linear model fit
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
# Option+Shift+K shows RStudio keyboard shortcuts
# Assigning a ggplot to a name stores it; typing the name draws it
my_bar_plot <- ggplot(mpg, aes(x = class)) +
  geom_bar()
my_bar_plot
my_scatter_plot <- ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point()
my_scatter_plot
# Passing plot = explicitly saves that object, not the last plot displayed
ggsave(filename = "plots/mpg-plot.png", plot = my_bar_plot) # my_bar_plot is saved
## Saving 8 x 5 in image
# Ch 3 ----
# Install nycflights13 if missing. requireNamespace() only checks availability
# (require() would also attach the package and is discouraged for this);
# the package is attached explicitly via library() below.
if (!requireNamespace("nycflights13", quietly = TRUE)) install.packages("nycflights13")
## Loading required package: nycflights13
# Install Lahman if missing; availability check only, attached by library() below
if (!requireNamespace("Lahman", quietly = TRUE)) install.packages("Lahman")
## Loading required package: Lahman
library(nycflights13)
library(tidyverse)
library(dplyr) # already attached by tidyverse; harmless duplicate
library(Lahman)
# dplyr masks filter() and lag(); to use the base versions after loading
# dplyr, call them by their full names: stats::filter() and stats::lag()
?flights
flights
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
glimpse(flights) # <int> integer, <dbl> double (real numbers), <chr> character (strings), <dttm> date-time
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
# Mean arrival delay per day for flights to Houston (IAH)
flights |>
  filter(dest == "IAH") |>
  group_by(year, month, day) |>
  summarize(
    arr_delay = mean(arr_delay, na.rm = TRUE) # na.rm: delays are NA for cancelled flights
  )
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day arr_delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 17.8
## 2 2013 1 2 7
## 3 2013 1 3 18.3
## 4 2013 1 4 -3.2
## 5 2013 1 5 20.2
## 6 2013 1 6 9.28
## 7 2013 1 7 -7.74
## 8 2013 1 8 7.79
## 9 2013 1 9 18.1
## 10 2013 1 10 6.68
## # ℹ 355 more rows
# find all flights that departed more than 120 minutes (two hours) late
flights |>
  filter(dep_delay > 120)
## # A tibble: 9,723 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 848 1835 853 1001 1950
## 2 2013 1 1 957 733 144 1056 853
## 3 2013 1 1 1114 900 134 1447 1222
## 4 2013 1 1 1540 1338 122 2020 1825
## 5 2013 1 1 1815 1325 290 2120 1542
## 6 2013 1 1 1842 1422 260 1958 1535
## 7 2013 1 1 1856 1645 131 2212 2005
## 8 2013 1 1 1934 1725 129 2126 1855
## 9 2013 1 1 1938 1703 155 2109 1823
## 10 2013 1 1 1942 1705 157 2124 1830
## # ℹ 9,713 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights that departed on January 1 (& combines both conditions)
flights |>
  filter(month == 1 & day == 1)
## # A tibble: 842 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 832 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights that departed in January or February (| is elementwise OR)
flights |>
  filter(month == 1 | month == 2)
## # A tibble: 51,955 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 51,945 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# A shorter way to select flights that departed in January or February
flights |>
  filter(month %in% c(1, 2))
## # A tibble: 51,955 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 51,945 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights that departed in November or December
flights |>
  filter(month %in% c(11, 12))
## # A tibble: 55,403 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 11 1 5 2359 6 352 345
## 2 2013 11 1 35 2250 105 123 2356
## 3 2013 11 1 455 500 -5 641 651
## 4 2013 11 1 539 545 -6 856 827
## 5 2013 11 1 542 545 -3 831 855
## 6 2013 11 1 549 600 -11 912 923
## 7 2013 11 1 550 600 -10 705 659
## 8 2013 11 1 554 600 -6 659 701
## 9 2013 11 1 554 600 -6 826 827
## 10 2013 11 1 554 600 -6 749 751
## # ℹ 55,393 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Save the filtered result to a name, then print it
feb <- flights |>
  filter(month == 2)
feb
## # A tibble: 24,951 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 2 1 456 500 -4 652 648
## 2 2013 2 1 520 525 -5 816 820
## 3 2013 2 1 527 530 -3 837 829
## 4 2013 2 1 532 540 -8 1007 1017
## 5 2013 2 1 540 540 0 859 850
## 6 2013 2 1 552 600 -8 714 715
## 7 2013 2 1 552 600 -8 919 910
## 8 2013 2 1 552 600 -8 655 709
## 9 2013 2 1 553 600 -7 833 815
## 10 2013 2 1 553 600 -7 821 825
## # ℹ 24,941 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Sort by several columns; later columns break ties in earlier ones
flights |>
  arrange(year, month, day, dep_delay, dep_time)
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 940 955 -15 1226 1220
## 2 2013 1 1 2030 2045 -15 2150 2225
## 3 2013 1 1 1716 1730 -14 1947 1953
## 4 2013 1 1 946 959 -13 1146 1202
## 5 2013 1 1 2217 2229 -12 249 315
## 6 2013 1 1 839 850 -11 1027 1035
## 7 2013 1 1 1849 1900 -11 2131 2129
## 8 2013 1 1 800 810 -10 949 955
## 9 2013 1 1 805 815 -10 1006 1010
## 10 2013 1 1 820 830 -10 940 954
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Most-delayed departures first
flights |>
  arrange(desc(dep_delay))
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 6 27 959 1900 899 1236 2226
## 9 2013 7 22 2257 759 898 121 1026
## 10 2013 12 5 756 1700 896 1058 2020
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# 3.2.4 distinct()
flights |>
  distinct() # remove any duplicate rows (flights has none: still 336,776)
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Find all unique origin and destination pairs (224 routes)
flights |>
  distinct(origin, dest)
## # A tibble: 224 × 2
## origin dest
## <chr> <chr>
## 1 EWR IAH
## 2 LGA IAH
## 3 JFK MIA
## 4 JFK BQN
## 5 LGA ATL
## 6 EWR ORD
## 7 EWR FLL
## 8 LGA IAD
## 9 JFK MCO
## 10 LGA ORD
## # ℹ 214 more rows
# Keep the other columns when filtering for unique rows with .keep_all = TRUE
# (returns the first occurrence of each origin/dest pair)
flights |>
  distinct(origin, dest, .keep_all = TRUE)
## # A tibble: 224 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 214 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Number of flights per route, busiest first
flights |>
  count(origin, dest, sort = TRUE)
## # A tibble: 224 × 3
## origin dest n
## <chr> <chr> <int>
## 1 JFK LAX 11262
## 2 LGA ATL 10263
## 3 LGA ORD 8857
## 4 JFK SFO 8204
## 5 LGA CLT 6168
## 6 EWR ORD 6100
## 7 JFK BOS 5898
## 8 LGA MIA 5781
## 9 JFK MCO 5464
## 10 EWR BOS 5327
## # ℹ 214 more rows
# Number of flights per month, busiest first
flights |>
  count(year, month, sort = TRUE)
## # A tibble: 12 × 3
## year month n
## <int> <int> <int>
## 1 2013 7 29425
## 2 2013 8 29327
## 3 2013 10 28889
## 4 2013 3 28834
## 5 2013 5 28796
## 6 2013 4 28330
## 7 2013 6 28243
## 8 2013 12 28135
## 9 2013 9 27574
## 10 2013 11 27268
## 11 2013 1 27004
## 12 2013 2 24951
# Departures delayed two hours or more (>= includes exactly 120 minutes)
flights |>
  filter(dep_delay >= 120)
## # A tibble: 9,888 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 848 1835 853 1001 1950
## 2 2013 1 1 957 733 144 1056 853
## 3 2013 1 1 1114 900 134 1447 1222
## 4 2013 1 1 1540 1338 122 2020 1825
## 5 2013 1 1 1815 1325 290 2120 1542
## 6 2013 1 1 1842 1422 260 1958 1535
## 7 2013 1 1 1856 1645 131 2212 2005
## 8 2013 1 1 1934 1725 129 2126 1855
## 9 2013 1 1 1938 1703 155 2109 1823
## 10 2013 1 1 1942 1705 157 2124 1830
## # ℹ 9,878 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights to either Houston airport
flights |>
  filter(dest %in% c("IAH", "HOU"))
## # A tibble: 9,313 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 623 627 -4 933 932
## 4 2013 1 1 728 732 -4 1041 1038
## 5 2013 1 1 739 739 0 1104 1038
## 6 2013 1 1 908 908 0 1228 1219
## 7 2013 1 1 1028 1026 2 1350 1339
## 8 2013 1 1 1044 1045 -1 1352 1351
## 9 2013 1 1 1114 900 134 1447 1222
## 10 2013 1 1 1205 1200 5 1503 1505
## # ℹ 9,303 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights operated by United, American, or Delta
flights |>
  filter(carrier %in% c("UA", "AA", "DL"))
## # A tibble: 139,504 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 554 600 -6 812 837
## 5 2013 1 1 554 558 -4 740 728
## 6 2013 1 1 558 600 -2 753 745
## 7 2013 1 1 558 600 -2 924 917
## 8 2013 1 1 558 600 -2 923 937
## 9 2013 1 1 559 600 -1 941 910
## 10 2013 1 1 559 600 -1 854 902
## # ℹ 139,494 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights that departed in summer (June, July, August)
flights |>
  filter(month %in% c(6, 7, 8))
## # A tibble: 86,995 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 6 1 2 2359 3 341 350
## 2 2013 6 1 451 500 -9 624 640
## 3 2013 6 1 506 515 -9 715 800
## 4 2013 6 1 534 545 -11 800 829
## 5 2013 6 1 538 545 -7 925 922
## 6 2013 6 1 539 540 -1 832 840
## 7 2013 6 1 546 600 -14 850 910
## 8 2013 6 1 551 600 -9 828 850
## 9 2013 6 1 552 600 -8 647 655
## 10 2013 6 1 553 600 -7 700 711
## # ℹ 86,985 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Departed exactly on time but arrived more than two hours late (3 flights)
flights |>
  filter(arr_delay > 120 & dep_delay == 0)
## # A tibble: 3 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 10 7 1350 1350 0 1736 1526
## 2 2013 5 23 1810 1810 0 2208 2000
## 3 2013 7 1 905 905 0 1443 1223
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Most-delayed departures first (same as earlier)
flights |>
  arrange(desc(dep_delay))
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 6 27 959 1900 899 1236 2226
## 9 2013 7 22 2257 759 898 121 1026
## 10 2013 12 5 756 1700 896 1058 2020
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Earliest clock departure times first (dep_time is HHMM, so 1 = 12:01 am)
flights |>
  arrange(dep_time)
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 13 1 2249 72 108 2357
## 2 2013 1 31 1 2100 181 124 2225
## 3 2013 11 13 1 2359 2 442 440
## 4 2013 12 16 1 2359 2 447 437
## 5 2013 12 20 1 2359 2 430 440
## 6 2013 12 26 1 2359 2 437 440
## 7 2013 12 30 1 2359 2 441 437
## 8 2013 2 11 1 2100 181 111 2225
## 9 2013 2 24 1 2245 76 121 2354
## 10 2013 3 8 1 2355 6 431 440
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Sort by naive "duration" arr_time - dep_time.
# NOTE(review): these are HHMM clock numbers, not minutes, so overnight
# flights come out negative (e.g. dep 2400, arr 54 in row 1 below) — this is
# not a true flight duration; air_time would be.
flights |>
  arrange(arr_time - dep_time) |>
  relocate(dep_time, arr_time)
## # A tibble: 336,776 × 19
## dep_time arr_time year month day sched_dep_time dep_delay sched_arr_time
## <int> <int> <int> <int> <int> <int> <dbl> <int>
## 1 2400 54 2013 7 17 2142 138 2259
## 2 2400 59 2013 12 9 2250 70 2356
## 3 2338 17 2013 6 12 2129 129 2235
## 4 2332 14 2013 12 29 2155 97 2300
## 5 2335 18 2013 11 6 2215 80 2317
## 6 2347 30 2013 2 25 2145 122 2239
## 7 2351 35 2013 8 13 2152 119 2258
## 8 2342 27 2013 10 11 2030 192 2205
## 9 2356 41 2013 2 26 2000 236 2104
## 10 2342 28 2013 1 24 2159 103 2300
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Number of distinct flying days in the data: 365
flights |>
  distinct(month, day) |>
  arrange(month, day) |> # sorting is not needed for the count; kept from exploration
  count() # OR nrow()
## # A tibble: 1 × 1
## n
## <int>
## 1 365
# Longest route: JFK to HNL (4,983 miles)
flights |>
  arrange(desc(distance)) |>
  relocate(distance, origin, dest) # JFK to HNL
## # A tibble: 336,776 × 19
## distance origin dest year month day dep_time sched_dep_time dep_delay
## <dbl> <chr> <chr> <int> <int> <int> <int> <int> <dbl>
## 1 4983 JFK HNL 2013 1 1 857 900 -3
## 2 4983 JFK HNL 2013 1 2 909 900 9
## 3 4983 JFK HNL 2013 1 3 914 900 14
## 4 4983 JFK HNL 2013 1 4 900 900 0
## 5 4983 JFK HNL 2013 1 5 858 900 -2
## 6 4983 JFK HNL 2013 1 6 1019 900 79
## 7 4983 JFK HNL 2013 1 7 1042 900 102
## 8 4983 JFK HNL 2013 1 8 901 900 1
## 9 4983 JFK HNL 2013 1 9 641 900 1301
## 10 4983 JFK HNL 2013 1 10 859 900 -1
## # ℹ 336,766 more rows
## # ℹ 10 more variables: arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, air_time <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Shortest distance: a 17-mile EWR-LGA row (cancelled; dep_time is NA),
# then EWR to PHL at 80 miles
flights |>
  arrange(distance) |>
  relocate(distance, origin, dest) # EWR to PHL
## # A tibble: 336,776 × 19
## distance origin dest year month day dep_time sched_dep_time dep_delay
## <dbl> <chr> <chr> <int> <int> <int> <int> <int> <dbl>
## 1 17 EWR LGA 2013 7 27 NA 106 NA
## 2 80 EWR PHL 2013 1 3 2127 2129 -2
## 3 80 EWR PHL 2013 1 4 1240 1200 40
## 4 80 EWR PHL 2013 1 4 1829 1615 134
## 5 80 EWR PHL 2013 1 4 2128 2129 -1
## 6 80 EWR PHL 2013 1 5 1155 1200 -5
## 7 80 EWR PHL 2013 1 6 2125 2129 -4
## 8 80 EWR PHL 2013 1 7 2124 2129 -5
## 9 80 EWR PHL 2013 1 8 2127 2130 -3
## 10 80 EWR PHL 2013 1 9 2126 2129 -3
## # ℹ 336,766 more rows
## # ℹ 10 more variables: arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, air_time <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Order does not matter
# mutate
flights |>
mutate(
gain = dep_delay - arr_delay,
speed = distance / air_time * 60
) |>
relocate(gain, speed)
## # A tibble: 336,776 × 21
## gain speed year month day dep_time sched_dep_time dep_delay arr_time
## <dbl> <dbl> <int> <int> <int> <int> <int> <dbl> <int>
## 1 -9 370. 2013 1 1 517 515 2 830
## 2 -16 374. 2013 1 1 533 529 4 850
## 3 -31 408. 2013 1 1 542 540 2 923
## 4 17 517. 2013 1 1 544 545 -1 1004
## 5 19 394. 2013 1 1 554 600 -6 812
## 6 -16 288. 2013 1 1 554 558 -4 740
## 7 -24 404. 2013 1 1 555 600 -5 913
## 8 11 259. 2013 1 1 557 600 -3 709
## 9 5 405. 2013 1 1 557 600 -3 838
## 10 -10 319. 2013 1 1 558 600 -2 753
## # ℹ 336,766 more rows
## # ℹ 12 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# NOTE(review): BUG in these notes -- `before = 1` (missing the leading dot)
# is NOT the `.before` argument of mutate(); it creates a new column literally
# named `before`, as the printout below confirms (22 columns, `before <dbl>`
# appended at the end). The intended call is `.before = 1`.
# mutate
flights |>
mutate(
gain = dep_delay - arr_delay,
speed = distance / air_time * 60,
before = 1
) # NOT equivalent to relocating gain/speed first -- see note above
## # A tibble: 336,776 × 22
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 14 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, gain <dbl>, speed <dbl>,
## # before <dbl>
# add gain/speed right after the day variable; the leading . marks
# .after (like .before) as an argument to mutate(), not a new column name
flights |>
mutate(
gain = dep_delay - arr_delay,
speed = distance / air_time * 60,
.after = day
)
## # A tibble: 336,776 × 21
## year month day gain speed dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <dbl> <dbl> <int> <int> <dbl> <int>
## 1 2013 1 1 -9 370. 517 515 2 830
## 2 2013 1 1 -16 374. 533 529 4 850
## 3 2013 1 1 -31 408. 542 540 2 923
## 4 2013 1 1 17 517. 544 545 -1 1004
## 5 2013 1 1 19 394. 554 600 -6 812
## 6 2013 1 1 -16 288. 554 558 -4 740
## 7 2013 1 1 -24 404. 555 600 -5 913
## 8 2013 1 1 11 259. 557 600 -3 709
## 9 2013 1 1 5 405. 557 600 -3 838
## 10 2013 1 1 -10 319. 558 600 -2 753
## # ℹ 336,766 more rows
## # ℹ 12 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# only keep affected variables
df_delay_gain <- flights |>
mutate(
gain = dep_delay - arr_delay,
hours = air_time / 60,
gain_per_hour = gain / hours,
.keep = "used"
)
df_delay_gain
## # A tibble: 336,776 × 6
## dep_delay arr_delay air_time gain hours gain_per_hour
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 11 227 -9 3.78 -2.38
## 2 4 20 227 -16 3.78 -4.23
## 3 2 33 160 -31 2.67 -11.6
## 4 -1 -18 183 17 3.05 5.57
## 5 -6 -25 116 19 1.93 9.83
## 6 -4 12 150 -16 2.5 -6.4
## 7 -5 19 158 -24 2.63 -9.11
## 8 -3 -14 53 11 0.883 12.5
## 9 -3 -8 140 5 2.33 2.14
## 10 -2 8 138 -10 2.3 -4.35
## # ℹ 336,766 more rows
# 3.3.2 select()
# select columns by name
flights |>
select(year, month, day, carrier)
## # A tibble: 336,776 × 4
## year month day carrier
## <int> <int> <int> <chr>
## 1 2013 1 1 UA
## 2 2013 1 1 UA
## 3 2013 1 1 AA
## 4 2013 1 1 B6
## 5 2013 1 1 DL
## 6 2013 1 1 UA
## 7 2013 1 1 B6
## 8 2013 1 1 EV
## 9 2013 1 1 B6
## 10 2013 1 1 AA
## # ℹ 336,766 more rows
# select all columns between variables
flights |>
select(flight:dest)
## # A tibble: 336,776 × 4
## flight tailnum origin dest
## <int> <chr> <chr> <chr>
## 1 1545 N14228 EWR IAH
## 2 1714 N24211 LGA IAH
## 3 1141 N619AA JFK MIA
## 4 725 N804JB JFK BQN
## 5 461 N668DN LGA ATL
## 6 1696 N39463 EWR ORD
## 7 507 N516JB EWR FLL
## 8 5708 N829AS LGA IAD
## 9 79 N593JB JFK MCO
## 10 301 N3ALAA LGA ORD
## # ℹ 336,766 more rows
# select all columns except variables
flights |>
select(!year:day) # Historically this operation was done with - instead of !, so you’re likely to see that in the wild. These two operators serve the same purpose but with subtle differences in behavior. We recommend using ! because it reads as “not” and combines well with & and |.
## # A tibble: 336,776 × 16
## dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
## <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 517 515 2 830 819 11 UA
## 2 533 529 4 850 830 20 UA
## 3 542 540 2 923 850 33 AA
## 4 544 545 -1 1004 1022 -18 B6
## 5 554 600 -6 812 837 -25 DL
## 6 554 558 -4 740 728 12 UA
## 7 555 600 -5 913 854 19 B6
## 8 557 600 -3 709 723 -14 EV
## 9 557 600 -3 838 846 -8 B6
## 10 558 600 -2 753 745 8 AA
## # ℹ 336,766 more rows
## # ℹ 9 more variables: flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# select all columns with characters
flights |>
select(where(is.character))
## # A tibble: 336,776 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N24211 LGA IAH
## 3 AA N619AA JFK MIA
## 4 B6 N804JB JFK BQN
## 5 DL N668DN LGA ATL
## 6 UA N39463 EWR ORD
## 7 B6 N516JB EWR FLL
## 8 EV N829AS LGA IAD
## 9 B6 N593JB JFK MCO
## 10 AA N3ALAA LGA ORD
## # ℹ 336,766 more rows
# rename variables
flights |>
select(tail_num = tailnum) # new name on left, old on right
## # A tibble: 336,776 × 1
## tail_num
## <chr>
## 1 N14228
## 2 N24211
## 3 N619AA
## 4 N804JB
## 5 N668DN
## 6 N39463
## 7 N516JB
## 8 N829AS
## 9 N593JB
## 10 N3ALAA
## # ℹ 336,766 more rows
# further on relocation: move year:dep_time to the end, after time_hour
flights |>
relocate(year:dep_time, .after = time_hour)
## # A tibble: 336,776 × 19
## sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight
## <int> <dbl> <int> <int> <dbl> <chr> <int>
## 1 515 2 830 819 11 UA 1545
## 2 529 4 850 830 20 UA 1714
## 3 540 2 923 850 33 AA 1141
## 4 545 -1 1004 1022 -18 B6 725
## 5 600 -6 812 837 -25 DL 461
## 6 558 -4 740 728 12 UA 1696
## 7 600 -5 913 854 19 B6 507
## 8 600 -3 709 723 -14 EV 5708
## 9 600 -3 838 846 -8 B6 79
## 10 600 -2 753 745 8 AA 301
## # ℹ 336,766 more rows
## # ℹ 12 more variables: tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>, year <int>,
## # month <int>, day <int>, dep_time <int>
flights |>
relocate(starts_with("arr"), .before = dep_time)
## # A tibble: 336,776 × 19
## year month day arr_time arr_delay dep_time sched_dep_time dep_delay
## <int> <int> <int> <int> <dbl> <int> <int> <dbl>
## 1 2013 1 1 830 11 517 515 2
## 2 2013 1 1 850 20 533 529 4
## 3 2013 1 1 923 33 542 540 2
## 4 2013 1 1 1004 -18 544 545 -1
## 5 2013 1 1 812 -25 554 600 -6
## 6 2013 1 1 740 12 554 558 -4
## 7 2013 1 1 913 19 555 600 -5
## 8 2013 1 1 709 -14 557 600 -3
## 9 2013 1 1 838 -8 557 600 -3
## 10 2013 1 1 753 8 558 600 -2
## # ℹ 336,766 more rows
## # ℹ 11 more variables: sched_arr_time <int>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
flights |>
select(dep_time, sched_dep_time, dep_delay) # dep_delay diff between dep_time and sched
## # A tibble: 336,776 × 3
## dep_time sched_dep_time dep_delay
## <int> <int> <dbl>
## 1 517 515 2
## 2 533 529 4
## 3 542 540 2
## 4 544 545 -1
## 5 554 600 -6
## 6 554 558 -4
## 7 555 600 -5
## 8 557 600 -3
## 9 557 600 -3
## 10 558 600 -2
## # ℹ 336,766 more rows
flights |>
select(dep_time:dep_delay)
## # A tibble: 336,776 × 3
## dep_time sched_dep_time dep_delay
## <int> <int> <dbl>
## 1 517 515 2
## 2 533 529 4
## 3 542 540 2
## 4 544 545 -1
## 5 554 600 -6
## 6 554 558 -4
## 7 555 600 -5
## 8 557 600 -3
## 9 557 600 -3
## 10 558 600 -2
## # ℹ 336,766 more rows
flights |>
select(starts_with("dep") | starts_with("arr"))
## # A tibble: 336,776 × 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # ℹ 336,766 more rows
flights |>
select(month, month, month)
## # A tibble: 336,776 × 1
## month
## <int>
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
## 7 1
## 8 1
## 9 1
## 10 1
## # ℹ 336,766 more rows
variables <- c("year", "month", "day", "dep_delay", "arr_delay")
flights |>
select(any_of(variables))
## # A tibble: 336,776 × 5
## year month day dep_delay arr_delay
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 1 2 11
## 2 2013 1 1 4 20
## 3 2013 1 1 2 33
## 4 2013 1 1 -1 -18
## 5 2013 1 1 -6 -25
## 6 2013 1 1 -4 12
## 7 2013 1 1 -5 19
## 8 2013 1 1 -3 -14
## 9 2013 1 1 -3 -8
## 10 2013 1 1 -2 8
## # ℹ 336,766 more rows
flights |>
select(contains("TIME")) # by default, case ignored
## # A tibble: 336,776 × 6
## dep_time sched_dep_time arr_time sched_arr_time air_time time_hour
## <int> <int> <int> <int> <dbl> <dttm>
## 1 517 515 830 819 227 2013-01-01 05:00:00
## 2 533 529 850 830 227 2013-01-01 05:00:00
## 3 542 540 923 850 160 2013-01-01 05:00:00
## 4 544 545 1004 1022 183 2013-01-01 05:00:00
## 5 554 600 812 837 116 2013-01-01 06:00:00
## 6 554 558 740 728 150 2013-01-01 05:00:00
## 7 555 600 913 854 158 2013-01-01 06:00:00
## 8 557 600 709 723 53 2013-01-01 06:00:00
## 9 557 600 838 846 140 2013-01-01 06:00:00
## 10 558 600 753 745 138 2013-01-01 06:00:00
## # ℹ 336,766 more rows
flights |>
rename(air_time_min = air_time) |>
relocate(air_time_min, .before = 1)
## # A tibble: 336,776 × 19
## air_time_min year month day dep_time sched_dep_time dep_delay arr_time
## <dbl> <int> <int> <int> <int> <int> <dbl> <int>
## 1 227 2013 1 1 517 515 2 830
## 2 227 2013 1 1 533 529 4 850
## 3 160 2013 1 1 542 540 2 923
## 4 183 2013 1 1 544 545 -1 1004
## 5 116 2013 1 1 554 600 -6 812
## 6 150 2013 1 1 554 558 -4 740
## 7 158 2013 1 1 555 600 -5 913
## 8 53 2013 1 1 557 600 -3 709
## 9 140 2013 1 1 557 600 -3 838
## 10 138 2013 1 1 558 600 -2 753
## # ℹ 336,766 more rows
## # ℹ 11 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
flights |>
select(tailnum)
## # A tibble: 336,776 × 1
## tailnum
## <chr>
## 1 N14228
## 2 N24211
## 3 N619AA
## 4 N804JB
## 5 N668DN
## 6 N39463
## 7 N516JB
## 8 N829AS
## 9 N593JB
## 10 N3ALAA
## # ℹ 336,766 more rows
# arrange(arr_delay) would fail here: after select(tailnum), arr_delay is no longer in the data
flights |>
filter(dest == "IAH") |>
mutate(speed = distance / air_time * 60) |>
select(year:day, dep_time, carrier, flight, speed) |>
arrange(desc(speed))
## # A tibble: 7,198 × 7
## year month day dep_time carrier flight speed
## <int> <int> <int> <int> <chr> <int> <dbl>
## 1 2013 7 9 707 UA 226 522.
## 2 2013 8 27 1850 UA 1128 521.
## 3 2013 8 28 902 UA 1711 519.
## 4 2013 8 28 2122 UA 1022 519.
## 5 2013 6 11 1628 UA 1178 515.
## 6 2013 8 27 1017 UA 333 515.
## 7 2013 8 27 1205 UA 1421 515.
## 8 2013 8 27 1758 UA 302 515.
## 9 2013 9 27 521 UA 252 515.
## 10 2013 8 28 625 UA 559 515.
## # ℹ 7,188 more rows
flights |>
group_by(month)
## # A tibble: 336,776 × 19
## # Groups: month [12]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# 3.5.2 summarize()
flights |>
group_by(month) |>
summarize(
avg_delay = mean(dep_delay) # uh oh, missing data!
)
## # A tibble: 12 × 2
## month avg_delay
## <int> <dbl>
## 1 1 NA
## 2 2 NA
## 3 3 NA
## 4 4 NA
## 5 5 NA
## 6 6 NA
## 7 7 NA
## 8 8 NA
## 9 9 NA
## 10 10 NA
## 11 11 NA
## 12 12 NA
flights |>
group_by(month) |>
summarize(
avg_delay = mean(dep_delay, na.rm = TRUE)
)
## # A tibble: 12 × 2
## month avg_delay
## <int> <dbl>
## 1 1 10.0
## 2 2 10.8
## 3 3 13.2
## 4 4 13.9
## 5 5 13.0
## 6 6 20.8
## 7 7 21.7
## 8 8 12.6
## 9 9 6.72
## 10 10 6.24
## 11 11 5.44
## 12 12 16.6
flights |>
group_by(month) |>
summarize(
avg_delay = mean(dep_delay, na.rm = TRUE),
n = n() # number of rows in each group
)
## # A tibble: 12 × 3
## month avg_delay n
## <int> <dbl> <int>
## 1 1 10.0 27004
## 2 2 10.8 24951
## 3 3 13.2 28834
## 4 4 13.9 28330
## 5 5 13.0 28796
## 6 6 20.8 28243
## 7 7 21.7 29425
## 8 8 12.6 29327
## 9 9 6.72 27574
## 10 10 6.24 28889
## 11 11 5.44 27268
## 12 12 16.6 28135
# 3.5.3 The slice_ functions
# df |> slice_head(n = 1) takes the first row from each group.
# df |> slice_tail(n = 1) takes the last row in each group.
# df |> slice_min(x, n = 1) takes the row with the smallest value of column x.
# df |> slice_max(x, n = 1) takes the row with the largest value of column x.
# df |> slice_sample(n = 1) takes one random row.
flights |>
group_by(dest) |>
slice_max(arr_delay, n = 1) |> # takes the row with the largest value of column arr_delay
relocate(dest) # Note that there are 105 destinations but we get 108 rows here. What’s up? slice_min() and slice_max() keep tied values so n = 1 means give us all rows with the highest value. If you want exactly one row per group you can set with_ties = FALSE.
## # A tibble: 108 × 19
## # Groups: dest [105]
## dest year month day dep_time sched_dep_time dep_delay arr_time
## <chr> <int> <int> <int> <int> <int> <dbl> <int>
## 1 ABQ 2013 7 22 2145 2007 98 132
## 2 ACK 2013 7 23 1139 800 219 1250
## 3 ALB 2013 1 25 123 2000 323 229
## 4 ANC 2013 8 17 1740 1625 75 2042
## 5 ATL 2013 7 22 2257 759 898 121
## 6 AUS 2013 7 10 2056 1505 351 2347
## 7 AVL 2013 8 13 1156 832 204 1417
## 8 BDL 2013 2 21 1728 1316 252 1839
## 9 BGR 2013 12 1 1504 1056 248 1628
## 10 BHM 2013 4 10 25 1900 325 136
## # ℹ 98 more rows
## # ℹ 11 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# One row per destination with the largest arrival delay.
# with_ties = FALSE breaks ties, so exactly 105 rows (one per dest) come back.
flights |>
group_by(dest) |>
slice_max(arr_delay, n = 1, with_ties = FALSE) |> # no ties, only one row per group
relocate(dest)
## # A tibble: 105 × 19
## # Groups: dest [105]
## dest year month day dep_time sched_dep_time dep_delay arr_time
## <chr> <int> <int> <int> <int> <int> <dbl> <int>
## 1 ABQ 2013 7 22 2145 2007 98 132
## 2 ACK 2013 7 23 1139 800 219 1250
## 3 ALB 2013 1 25 123 2000 323 229
## 4 ANC 2013 8 17 1740 1625 75 2042
## 5 ATL 2013 7 22 2257 759 898 121
## 6 AUS 2013 7 10 2056 1505 351 2347
## 7 AVL 2013 8 13 1156 832 204 1417
## 8 BDL 2013 2 21 1728 1316 252 1839
## 9 BGR 2013 12 1 1504 1056 248 1628
## 10 BHM 2013 4 10 25 1900 325 136
## # ℹ 95 more rows
## # ℹ 11 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# This is similar to computing the max delay with summarize(), but you get the whole corresponding row (or rows if there’s a tie) instead of the single summary statistic.
daily <- flights |>
group_by(year, month, day) # multiple group variables, each summary peels off the last group.
daily
## # A tibble: 336,776 × 19
## # Groups: year, month, day [365]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
daily_flights <- daily |>
summarize(n = n()) # message below explains one group (day) was peeled off; 2 remain
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
daily_flights <- daily |>
summarize(
n = n(),
.groups = "drop_last" # suppress the message about peeling off groups
)
# summarize ungrouped data
daily |>
ungroup() |>
summarize( # only one row because ungrouped data is treated as a single group
avg_delay = mean(dep_delay, na.rm = TRUE),
flights = n()
)
## # A tibble: 1 × 2
## avg_delay flights
## <dbl> <int>
## 1 12.6 336776
# 3.5.6 .by
flights |>
summarize(
delay = mean(dep_delay, na.rm = TRUE),
n = n(),
.by = month # new and experimental
)
## # A tibble: 12 × 3
## month delay n
## <int> <dbl> <int>
## 1 1 10.0 27004
## 2 10 6.24 28889
## 3 11 5.44 27268
## 4 12 16.6 28135
## 5 2 10.8 24951
## 6 3 13.2 28834
## 7 4 13.9 28330
## 8 5 13.0 28796
## 9 6 20.8 28243
## 10 7 21.7 29425
## 11 8 12.6 29327
## 12 9 6.72 27574
# group by multiple variables
flights |>
summarize(
delay = mean(dep_delay, na.rm = TRUE),
n = n(),
.by = c(origin, dest) # .by works with all verbs and has the advantage that you don’t need to use the .groups argument to suppress the grouping message or ungroup() when you’re done.
)
## # A tibble: 224 × 4
## origin dest delay n
## <chr> <chr> <dbl> <int>
## 1 EWR IAH 11.8 3973
## 2 LGA IAH 9.06 2951
## 3 JFK MIA 9.34 3314
## 4 JFK BQN 6.67 599
## 5 LGA ATL 11.4 10263
## 6 EWR ORD 14.6 6100
## 7 EWR FLL 13.5 3793
## 8 LGA IAD 16.7 1803
## 9 JFK MCO 10.6 5464
## 10 LGA ORD 10.7 8857
## # ℹ 214 more rows
flights |>
group_by(carrier, dest) |>
summarize(n())
## `summarise()` has grouped output by 'carrier'. You can override using the
## `.groups` argument.
## # A tibble: 314 × 3
## # Groups: carrier [16]
## carrier dest `n()`
## <chr> <chr> <int>
## 1 9E ATL 59
## 2 9E AUS 2
## 3 9E AVL 10
## 4 9E BGR 1
## 5 9E BNA 474
## 6 9E BOS 914
## 7 9E BTV 2
## 8 9E BUF 833
## 9 9E BWI 856
## 10 9E CAE 3
## # ℹ 304 more rows
# Which carrier has the worst average departure delay?
flights |>
group_by(carrier) |>
summarize(
avg_dep_delay = mean(dep_delay, na.rm = TRUE) # NAs are cancelled flights
) |>
arrange(desc(avg_dep_delay)) # F9 (Frontier Airlines) has the worst delays
## # A tibble: 16 × 2
## carrier avg_dep_delay
## <chr> <dbl>
## 1 F9 20.2
## 2 EV 20.0
## 3 YV 19.0
## 4 FL 18.7
## 5 WN 17.7
## 6 9E 16.7
## 7 B6 13.0
## 8 VX 12.9
## 9 OO 12.6
## 10 UA 12.1
## 11 MQ 10.6
## 12 DL 9.26
## 13 AA 8.59
## 14 AS 5.80
## 15 HA 4.90
## 16 US 3.78
# Most-delayed departure per destination, exactly one row each.
flights |>
group_by(dest) |>
slice_max(dep_delay, with_ties = FALSE) |>
relocate(dest, dep_delay)
## # A tibble: 105 × 19
## # Groups: dest [105]
## dest dep_delay year month day dep_time sched_dep_time arr_time
## <chr> <dbl> <int> <int> <int> <int> <int> <int>
## 1 ABQ 142 2013 12 14 2223 2001 133
## 2 ACK 219 2013 7 23 1139 800 1250
## 3 ALB 323 2013 1 25 123 2000 229
## 4 ANC 75 2013 8 17 1740 1625 2042
## 5 ATL 898 2013 7 22 2257 759 121
## 6 AUS 351 2013 7 10 2056 1505 2347
## 7 AVL 222 2013 6 14 1158 816 1335
## 8 BDL 252 2013 2 21 1728 1316 1839
## 9 BGR 248 2013 12 1 1504 1056 1628
## 10 BHM 325 2013 4 10 25 1900 136
## # ℹ 95 more rows
## # ℹ 11 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Average departure delay by scheduled hour of departure.
df_dep_time <- flights |>
group_by(hour) |>
summarize(
avg_delay_time = mean(dep_delay, na.rm = TRUE)
)
ggplot(df_dep_time, aes(x = hour, y = avg_delay_time)) +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
# flights |>
# group_by(dest)
# slice_min(dep_delay, n = -1)
flights |>
count(carrier, sort = TRUE) # shorthand for group_by(carrier) |> summarize(n = n()) |> arrange(desc(n))
## # A tibble: 16 × 2
## carrier n
## <chr> <int>
## 1 UA 58665
## 2 B6 54635
## 3 EV 54173
## 4 DL 48110
## 5 AA 32729
## 6 MQ 26397
## 7 US 20536
## 8 9E 18460
## 9 WN 12275
## 10 VX 5162
## 11 FL 3260
## 12 AS 714
## 13 F9 685
## 14 YV 601
## 15 HA 342
## 16 OO 32
df <- tibble(
x = 1:5,
y = c("a", "b", "a", "a", "b"),
z = c("K", "K", "L", "L", "K")
)
df
## # A tibble: 5 × 3
## x y z
## <int> <chr> <chr>
## 1 1 a K
## 2 2 b K
## 3 3 a L
## 4 4 a L
## 5 5 b K
# a, a, a, a, b
df |>
group_by(y) # groups by y, though the printed rows remain the same
## # A tibble: 5 × 3
## # Groups: y [2]
## x y z
## <int> <chr> <chr>
## 1 1 a K
## 2 2 b K
## 3 3 a L
## 4 4 a L
## 5 5 b K
# a, a, a, a, b
df |>
arrange(y)
## # A tibble: 5 × 3
## x y z
## <int> <chr> <chr>
## 1 1 a K
## 2 3 a L
## 3 4 a L
## 4 2 b K
## 5 5 b K
# two rows, mean of all a's, mean of all b's, two columns
df |>
group_by(y) |>
summarize(mean_x = mean(x))
## # A tibble: 2 × 2
## y mean_x
## <chr> <dbl>
## 1 a 2.67
## 2 b 3.5
# group by all combinations of y and z present in the data: a K, a L, b K
df |>
group_by(y, z) |>
summarize(mean_x = mean(x))
## `summarise()` has grouped output by 'y'. You can override using the `.groups`
## argument.
## # A tibble: 3 × 3
## # Groups: y [2]
## y z mean_x
## <chr> <chr> <dbl>
## 1 a K 1
## 2 a L 3.5
## 3 b K 3.5
# same summary, but .groups = "drop" leaves the result ungrouped
df |>
group_by(y, z) |>
summarize(mean_x = mean(x), .groups = "drop") # result is not grouped
## # A tibble: 3 × 3
## y z mean_x
## <chr> <chr> <dbl>
## 1 a K 1
## 2 a L 3.5
## 3 b K 3.5
df |>
group_by(y, z) |>
summarize(mean_x = mean(x)) # show y, z and the summary column mean_x
## `summarise()` has grouped output by 'y'. You can override using the `.groups`
## argument.
## # A tibble: 3 × 3
## # Groups: y [2]
## y z mean_x
## <chr> <chr> <dbl>
## 1 a K 1
## 2 a L 3.5
## 3 b K 3.5
df |>
group_by(y, z) |>
mutate(mean_x = mean(x)) # create new column plus all others
## # A tibble: 5 × 4
## # Groups: y, z [3]
## x y z mean_x
## <int> <chr> <chr> <dbl>
## 1 1 a K 1
## 2 2 b K 3.5
## 3 3 a L 3.5
## 4 4 a L 3.5
## 5 5 b K 3.5
batters <- Lahman::Batting |>
group_by(playerID) |>
summarize(
performance = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
n = sum(AB, na.rm = TRUE)
)
batters
## # A tibble: 20,469 × 3
## playerID performance n
## <chr> <dbl> <int>
## 1 aardsda01 0 4
## 2 aaronha01 0.305 12364
## 3 aaronto01 0.229 944
## 4 aasedo01 0 5
## 5 abadan01 0.0952 21
## 6 abadfe01 0.111 9
## 7 abadijo01 0.224 49
## 8 abbated01 0.254 3044
## 9 abbeybe01 0.169 225
## 10 abbeych01 0.281 1756
## # ℹ 20,459 more rows
batters |>
filter(n > 100) |>
ggplot(aes(x = n, y = performance)) +
geom_point(alpha = 1 / 10) +
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
batters |>
arrange(desc(performance))
## # A tibble: 20,469 × 3
## playerID performance n
## <chr> <dbl> <int>
## 1 abramge01 1 1
## 2 alberan01 1 1
## 3 banisje01 1 1
## 4 bartocl01 1 1
## 5 bassdo01 1 1
## 6 birasst01 1 2
## 7 bruneju01 1 1
## 8 burnscb01 1 1
## 9 cammaer01 1 1
## 10 campsh01 1 1
## # ℹ 20,459 more rows
Ch 4
library(nycflights13)
flights %>%
filter(dest == "IAH") %>%
group_by(year, month, day) %>%
summarize(
n = n(),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(n > 10)
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 5
## # Groups: year, month [12]
## year month day n delay
## <int> <int> <int> <int> <dbl>
## 1 2013 1 1 20 17.8
## 2 2013 1 2 20 7
## 3 2013 1 3 19 18.3
## 4 2013 1 4 20 -3.2
## 5 2013 1 5 13 20.2
## 6 2013 1 6 18 9.28
## 7 2013 1 7 19 -7.74
## 8 2013 1 8 19 7.79
## 9 2013 1 9 19 18.1
## 10 2013 1 10 19 6.68
## # ℹ 355 more rows
flights %>%
filter(
carrier == "UA",
dest %in% c("IAH", "HOU"),
sched_dep_time > 900,
sched_arr_time < 2000
) %>%
group_by(flight) %>%
summarize(
delay = mean(arr_delay, na.rm = TRUE),
cancelled = sum(is.na(arr_delay)),
n = n()
) %>%
filter(n > 10)
## # A tibble: 74 × 4
## flight delay cancelled n
## <int> <dbl> <int> <int>
## 1 53 12.5 2 18
## 2 112 14.1 0 14
## 3 205 -1.71 0 14
## 4 235 -5.36 0 14
## 5 255 -9.47 0 15
## 6 268 38.6 1 15
## 7 292 6.57 0 21
## 8 318 10.7 1 20
## 9 337 20.1 2 21
## 10 370 17.5 0 11
## # ℹ 64 more rows
Ch 5
billboard
## # A tibble: 317 × 79
## artist track date.entered wk1 wk2 wk3 wk4 wk5 wk6 wk7 wk8
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 Pac Baby… 2000-02-26 87 82 72 77 87 94 99 NA
## 2 2Ge+her The … 2000-09-02 91 87 92 NA NA NA NA NA
## 3 3 Doors D… Kryp… 2000-04-08 81 70 68 67 66 57 54 53
## 4 3 Doors D… Loser 2000-10-21 76 76 72 69 67 65 55 59
## 5 504 Boyz Wobb… 2000-04-15 57 34 25 17 17 31 36 49
## 6 98^0 Give… 2000-08-19 51 39 34 26 26 19 2 2
## 7 A*Teens Danc… 2000-07-08 97 97 96 95 100 NA NA NA
## 8 Aaliyah I Do… 2000-01-29 84 62 51 41 38 35 35 38
## 9 Aaliyah Try … 2000-03-18 59 53 38 28 21 18 16 14
## 10 Adams, Yo… Open… 2000-08-26 76 76 74 69 68 67 61 58
## # ℹ 307 more rows
## # ℹ 68 more variables: wk9 <dbl>, wk10 <dbl>, wk11 <dbl>, wk12 <dbl>,
## # wk13 <dbl>, wk14 <dbl>, wk15 <dbl>, wk16 <dbl>, wk17 <dbl>, wk18 <dbl>,
## # wk19 <dbl>, wk20 <dbl>, wk21 <dbl>, wk22 <dbl>, wk23 <dbl>, wk24 <dbl>,
## # wk25 <dbl>, wk26 <dbl>, wk27 <dbl>, wk28 <dbl>, wk29 <dbl>, wk30 <dbl>,
## # wk31 <dbl>, wk32 <dbl>, wk33 <dbl>, wk34 <dbl>, wk35 <dbl>, wk36 <dbl>,
## # wk37 <dbl>, wk38 <dbl>, wk39 <dbl>, wk40 <dbl>, wk41 <dbl>, wk42 <dbl>, …
billboard |>
pivot_longer(
cols = starts_with("wk"),
names_to = "week",
values_to = "rank"
)
## # A tibble: 24,092 × 5
## artist track date.entered week rank
## <chr> <chr> <date> <chr> <dbl>
## 1 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk1 87
## 2 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk2 82
## 3 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk3 72
## 4 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk4 77
## 5 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk5 87
## 6 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk6 94
## 7 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk7 99
## 8 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk8 NA
## 9 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk9 NA
## 10 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk10 NA
## # ℹ 24,082 more rows
billboard |>
pivot_longer(
cols = starts_with("wk"),
names_to = "week",
values_to = "rank",
values_drop_na = TRUE
)
## # A tibble: 5,307 × 5
## artist track date.entered week rank
## <chr> <chr> <date> <chr> <dbl>
## 1 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk1 87
## 2 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk2 82
## 3 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk3 72
## 4 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk4 77
## 5 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk5 87
## 6 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk6 94
## 7 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk7 99
## 8 2Ge+her The Hardest Part Of ... 2000-09-02 wk1 91
## 9 2Ge+her The Hardest Part Of ... 2000-09-02 wk2 87
## 10 2Ge+her The Hardest Part Of ... 2000-09-02 wk3 92
## # ℹ 5,297 more rows
billboard_longer <- billboard |>
pivot_longer(
cols = starts_with("wk"),
names_to = "week",
values_to = "rank",
values_drop_na = TRUE
) |>
mutate(
week = parse_number(week)
)
billboard_longer
## # A tibble: 5,307 × 5
## artist track date.entered week rank
## <chr> <chr> <date> <dbl> <dbl>
## 1 2 Pac Baby Don't Cry (Keep... 2000-02-26 1 87
## 2 2 Pac Baby Don't Cry (Keep... 2000-02-26 2 82
## 3 2 Pac Baby Don't Cry (Keep... 2000-02-26 3 72
## 4 2 Pac Baby Don't Cry (Keep... 2000-02-26 4 77
## 5 2 Pac Baby Don't Cry (Keep... 2000-02-26 5 87
## 6 2 Pac Baby Don't Cry (Keep... 2000-02-26 6 94
## 7 2 Pac Baby Don't Cry (Keep... 2000-02-26 7 99
## 8 2Ge+her The Hardest Part Of ... 2000-09-02 1 91
## 9 2Ge+her The Hardest Part Of ... 2000-09-02 2 87
## 10 2Ge+her The Hardest Part Of ... 2000-09-02 3 92
## # ℹ 5,297 more rows
billboard_longer |>
ggplot(aes(x = week, y = rank, group = track)) +
geom_line(alpha = 0.25) +
scale_y_reverse()
df <- tribble(
~id, ~bp1, ~bp2,
"A", 100, 120,
"B", 140, 115,
"C", 120, 125
)
df |>
pivot_longer(
cols = bp1:bp2,
names_to = "measurement",
values_to = "value"
)
## # A tibble: 6 × 3
## id measurement value
## <chr> <chr> <dbl>
## 1 A bp1 100
## 2 A bp2 120
## 3 B bp1 140
## 4 B bp2 115
## 5 C bp1 120
## 6 C bp2 125
?who2
who2 |>
pivot_longer(
cols = !(country:year),
names_to = c("diagnosis", "gender", "age"),
names_sep = "_",
values_to = "count"
)
## # A tibble: 405,440 × 6
## country year diagnosis gender age count
## <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 Afghanistan 1980 sp m 014 NA
## 2 Afghanistan 1980 sp m 1524 NA
## 3 Afghanistan 1980 sp m 2534 NA
## 4 Afghanistan 1980 sp m 3544 NA
## 5 Afghanistan 1980 sp m 4554 NA
## 6 Afghanistan 1980 sp m 5564 NA
## 7 Afghanistan 1980 sp m 65 NA
## 8 Afghanistan 1980 sp f 014 NA
## 9 Afghanistan 1980 sp f 1524 NA
## 10 Afghanistan 1980 sp f 2534 NA
## # ℹ 405,430 more rows
who2 |>
pivot_longer(
cols = !(country:year),
names_to = c("diagnosis", "gender", "age"),
names_sep = "_",
values_to = "count"
)
## # A tibble: 405,440 × 6
## country year diagnosis gender age count
## <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 Afghanistan 1980 sp m 014 NA
## 2 Afghanistan 1980 sp m 1524 NA
## 3 Afghanistan 1980 sp m 2534 NA
## 4 Afghanistan 1980 sp m 3544 NA
## 5 Afghanistan 1980 sp m 4554 NA
## 6 Afghanistan 1980 sp m 5564 NA
## 7 Afghanistan 1980 sp m 65 NA
## 8 Afghanistan 1980 sp f 014 NA
## 9 Afghanistan 1980 sp f 1524 NA
## 10 Afghanistan 1980 sp f 2534 NA
## # ℹ 405,430 more rows
df <- tribble(
~id, ~bp1, ~bp2,
"A", 100, 120,
"B", 140, 115,
"C", 120, 125
)
df |> pivot_longer(
cols = bp1:bp2,
names_to = "measurement",
values_to = "amount"
)
## # A tibble: 6 × 3
## id measurement amount
## <chr> <chr> <dbl>
## 1 A bp1 100
## 2 A bp2 120
## 3 B bp1 140
## 4 B bp2 115
## 5 C bp1 120
## 6 C bp2 125
household
## # A tibble: 5 × 5
## family dob_child1 dob_child2 name_child1 name_child2
## <int> <date> <date> <chr> <chr>
## 1 1 1998-11-26 2000-01-29 Susan Jose
## 2 2 1996-06-22 NA Mark <NA>
## 3 3 2002-07-11 2004-04-05 Sam Seth
## 4 4 2004-10-10 2009-08-27 Craig Khai
## 5 5 2000-12-05 2005-02-28 Parker Gracie
?household
# ".value" in names_to means the first part of each column name (dob, name)
# becomes an output column of its own; tidyr documents that values_to is
# ignored when names_to contains ".value", so it is omitted here.
household |>
pivot_longer(
cols = !family,
names_to = c(".value", "child"),
names_sep = "_",
values_drop_na = TRUE # drops the child2 row for the one-child family
)
## # A tibble: 9 × 4
## family child dob name
## <int> <chr> <date> <chr>
## 1 1 child1 1998-11-26 Susan
## 2 1 child2 2000-01-29 Jose
## 3 2 child1 1996-06-22 Mark
## 4 3 child1 2002-07-11 Sam
## 5 3 child2 2004-04-05 Seth
## 6 4 child1 2004-10-10 Craig
## 7 4 child2 2009-08-27 Khai
## 8 5 child1 2000-12-05 Parker
## 9 5 child2 2005-02-28 Gracie
cms_patient_experience |>
distinct(measure_cd, measure_title)
## # A tibble: 6 × 2
## measure_cd measure_title
## <chr> <chr>
## 1 CAHPS_GRP_1 CAHPS for MIPS SSM: Getting Timely Care, Appointments, and Infor…
## 2 CAHPS_GRP_2 CAHPS for MIPS SSM: How Well Providers Communicate
## 3 CAHPS_GRP_3 CAHPS for MIPS SSM: Patient's Rating of Provider
## 4 CAHPS_GRP_5 CAHPS for MIPS SSM: Health Promotion and Education
## 5 CAHPS_GRP_8 CAHPS for MIPS SSM: Courteous and Helpful Office Staff
## 6 CAHPS_GRP_12 CAHPS for MIPS SSM: Stewardship of Patient Resources
cms_patient_experience |>
pivot_wider(
id_cols = starts_with("org"),
names_from = measure_cd,
values_from = prf_rate
)
## # A tibble: 95 × 8
## org_pac_id org_nm CAHPS_GRP_1 CAHPS_GRP_2 CAHPS_GRP_3 CAHPS_GRP_5 CAHPS_GRP_8
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0446157747 USC C… 63 87 86 57 85
## 2 0446162697 ASSOC… 59 85 83 63 88
## 3 0547164295 BEAVE… 49 NA 75 44 73
## 4 0749333730 CAPE … 67 84 85 65 82
## 5 0840104360 ALLIA… 66 87 87 64 87
## 6 0840109864 REX H… 73 87 84 67 91
## 7 0840513552 SCL H… 58 83 76 58 78
## 8 0941545784 GRITM… 46 86 81 54 NA
## 9 1052612785 COMMU… 65 84 80 58 87
## 10 1254237779 OUR L… 61 NA NA 65 NA
## # ℹ 85 more rows
## # ℹ 1 more variable: CAHPS_GRP_12 <dbl>
cms_patient_experience
## # A tibble: 500 × 5
## org_pac_id org_nm measure_cd measure_title prf_rate
## <chr> <chr> <chr> <chr> <dbl>
## 1 0446157747 USC CARE MEDICAL GROUP INC CAHPS_GRP… CAHPS for MI… 63
## 2 0446157747 USC CARE MEDICAL GROUP INC CAHPS_GRP… CAHPS for MI… 87
## 3 0446157747 USC CARE MEDICAL GROUP INC CAHPS_GRP… CAHPS for MI… 86
## 4 0446157747 USC CARE MEDICAL GROUP INC CAHPS_GRP… CAHPS for MI… 57
## 5 0446157747 USC CARE MEDICAL GROUP INC CAHPS_GRP… CAHPS for MI… 85
## 6 0446157747 USC CARE MEDICAL GROUP INC CAHPS_GRP… CAHPS for MI… 24
## 7 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI… 59
## 8 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI… 85
## 9 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI… 83
## 10 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI… 63
## # ℹ 490 more rows
df <- tribble(
~id, ~measurement, ~value,
"A", "bp1", 100,
"B", "bp1", 140,
"B", "bp2", 115,
"A", "bp2", 120,
"A", "bp3", 105
)
# Widen: one row per id, one column per distinct measurement value.
df |> pivot_wider(
  id_cols = id, # optional here: everything not used by names_from/values_from
  names_from = measurement,
  values_from = value # no trailing comma: an empty argument is an error in base R calls
)
## # A tibble: 2 × 4
## id bp1 bp2 bp3
## <chr> <dbl> <dbl> <dbl>
## 1 A 100 120 105
## 2 B 140 115 NA
df |>
distinct(measurement) |>
pull() # extra column
## [1] "bp1" "bp2" "bp3"
df |>
select(-measurement, -value) |> # all but measurement, value
distinct() # A B
## # A tibble: 2 × 1
## id
## <chr>
## 1 A
## 2 B
df |>
select(-measurement, -value) |>
distinct() |>
mutate(x = NA, y = NA, z = NA)
## # A tibble: 2 × 4
## id x y z
## <chr> <lgl> <lgl> <lgl>
## 1 A NA NA NA
## 2 B NA NA NA
df <- tribble(
~id, ~measurement, ~value,
"A", "bp1", 100,
"A", "bp1", 102,
"A", "bp2", 120,
"B", "bp1", 140,
"B", "bp2", 115
)
# Find id/measurement combinations that occur more than once
# (these would silently become list-columns in pivot_wider()).
df |>
  count(id, measurement, name = "n") |>
  filter(n > 1)
## # A tibble: 1 × 3
## id measurement n
## <chr> <chr> <int>
## 1 A bp1 2
df |>
# check to see if there are repeat combinations of ID and measurement and, if so, remove the repeat
distinct(id, measurement, .keep_all = TRUE) |>
pivot_wider(
id_cols = id,
names_from = measurement,
values_from = value
)
## # A tibble: 2 × 3
## id bp1 bp2
## <chr> <dbl> <dbl>
## 1 A 100 120
## 2 B 140 115
Ch 6
library(dplyr)
library(nycflights13)
# Keep only flights that actually departed and arrived
# (cancelled flights have NA delays).
not_cancelled <- flights |>
filter(!is.na(dep_delay), !is.na(arr_delay))
# Mean departure delay per day. summarize() peels off the innermost
# grouping level (day), so the result stays grouped by year, month —
# hence the `.groups` message shown below.
not_cancelled |>
group_by(year, month, day) |>
summarize(mean = mean(dep_delay))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.4
## 2 2013 1 2 13.7
## 3 2013 1 3 10.9
## 4 2013 1 4 8.97
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.56
## 9 2013 1 9 2.30
## 10 2013 1 10 2.84
## # ℹ 355 more rows
getwd()
## [1] "/Users/danielstafford/Coding/Tutorials/r4ds"
# always use relative paths! 👀
# Edit a DF interactively!
library(DataEditR)
# mtcars_new <- data_edit(mtcars,
# save_as = "mtcars_new.csv"
# )
library(ViewPipeSteps)
diamonds
## # A tibble: 53,940 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## # ℹ 53,930 more rows
# View pipe steps!
# diamonds %>%
# select(carat, cut, color, clarity, price) %>%
# group_by(color) %>%
# summarise(n = n(), price = mean(price)) %>%
# arrange(desc(color)) %>%
# print_pipe_steps()
Ch 7
students <- read_csv("https://pos.it/r4ds-students-csv")
## Rows: 6 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Full Name, favourite.food, mealPlan, AGE
## dbl (1): Student ID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(students)
## Rows: 6
## Columns: 5
## $ `Student ID` <dbl> 1, 2, 3, 4, 5, 6
## $ `Full Name` <chr> "Sunil Huffmann", "Barclay Lynn", "Jayendra Lyne", "Leo…
## $ favourite.food <chr> "Strawberry yoghurt", "French fries", "N/A", "Anchovies…
## $ mealPlan <chr> "Lunch only", "Lunch only", "Breakfast and lunch", "Lun…
## $ AGE <chr> "4", "5", "7", NA, "five", "6"
students <- read_csv("https://pos.it/r4ds-students-csv", na = c("N/A", "")) # capture both empty and N/A strings
## Rows: 6 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Full Name, favourite.food, mealPlan, AGE
## dbl (1): Student ID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(students)
## Rows: 6
## Columns: 5
## $ `Student ID` <dbl> 1, 2, 3, 4, 5, 6
## $ `Full Name` <chr> "Sunil Huffmann", "Barclay Lynn", "Jayendra Lyne", "Leo…
## $ favourite.food <chr> "Strawberry yoghurt", "French fries", NA, "Anchovies", …
## $ mealPlan <chr> "Lunch only", "Lunch only", "Breakfast and lunch", "Lun…
## $ AGE <chr> "4", "5", "7", NA, "five", "6"
# column names with spaces annoyingly need backticks, so rename them
students <- rename(
students,
student_id = `Student ID`,
full_name = `Full Name`
)
glimpse(students)
## Rows: 6
## Columns: 5
## $ student_id <dbl> 1, 2, 3, 4, 5, 6
## $ full_name <chr> "Sunil Huffmann", "Barclay Lynn", "Jayendra Lyne", "Leo…
## $ favourite.food <chr> "Strawberry yoghurt", "French fries", NA, "Anchovies", …
## $ mealPlan <chr> "Lunch only", "Lunch only", "Breakfast and lunch", "Lun…
## $ AGE <chr> "4", "5", "7", NA, "five", "6"
# alternative automatic method
library(janitor)
students <- read_csv("https://pos.it/r4ds-students-csv", na = c("N/A", ""))
## Rows: 6 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Full Name, favourite.food, mealPlan, AGE
## dbl (1): Student ID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(students)
## Rows: 6
## Columns: 5
## $ `Student ID` <dbl> 1, 2, 3, 4, 5, 6
## $ `Full Name` <chr> "Sunil Huffmann", "Barclay Lynn", "Jayendra Lyne", "Leo…
## $ favourite.food <chr> "Strawberry yoghurt", "French fries", NA, "Anchovies", …
## $ mealPlan <chr> "Lunch only", "Lunch only", "Breakfast and lunch", "Lun…
## $ AGE <chr> "4", "5", "7", NA, "five", "6"
students |> janitor::clean_names()
## # A tibble: 6 × 5
## student_id full_name favourite_food meal_plan age
## <dbl> <chr> <chr> <chr> <chr>
## 1 1 Sunil Huffmann Strawberry yoghurt Lunch only 4
## 2 2 Barclay Lynn French fries Lunch only 5
## 3 3 Jayendra Lyne <NA> Breakfast and lunch 7
## 4 4 Leon Rossini Anchovies Lunch only <NA>
## 5 5 Chidiegwu Dunkel Pizza Breakfast and lunch five
## 6 6 Güvenç Attila Ice cream Lunch only 6
# change cats to factors, fix age
students <- students |>
janitor::clean_names() |>
mutate(
meal_plan = factor(meal_plan), # changes from chr to fct
age = parse_number(if_else(age == "five", "5", age)) # if age is "five", turn it into "5", otherwise leave it alone; then parse all age strings as numbers
)
glimpse(students)
## Rows: 6
## Columns: 5
## $ student_id <dbl> 1, 2, 3, 4, 5, 6
## $ full_name <chr> "Sunil Huffmann", "Barclay Lynn", "Jayendra Lyne", "Leo…
## $ favourite_food <chr> "Strawberry yoghurt", "French fries", NA, "Anchovies", …
## $ meal_plan <fct> Lunch only, Lunch only, Breakfast and lunch, Lunch only…
## $ age <dbl> 4, 5, 7, NA, 5, 6
read_csv(
"a,b,c
1,2,3
4,5,6"
)
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): a, b, c
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 3
## a b c
## <dbl> <dbl> <dbl>
## 1 1 2 3
## 2 4 5 6
# skip first two rows of csv.
read_csv(
"The first line of metadata
The second line of metadata
x,y,z
1,2,3",
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 3 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): The first line of metadata
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 3 × 1
## `The first line of metadata`
## <chr>
## 1 The second line of metadata
## 2 x,y,z
## 3 1,2,3
read_csv(
"The first line of metadata
The second line of metadata
x,y,z
1,2,3",
skip = 2
)
## Rows: 1 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): x, y, z
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 3
## x y z
## <dbl> <dbl> <dbl>
## 1 1 2 3
# ignore specific lines
read_csv(
"/ A comment I want to skip
x,y,z
1,2,3",
comment = "/"
)
## Rows: 1 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): x, y, z
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 3
## x y z
## <dbl> <dbl> <dbl>
## 1 1 2 3
# no col names
read_csv(
"1,2,3
4,5,6",
col_names = FALSE # creates col names
)
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): X1, X2, X3
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 3
## X1 X2 X3
## <dbl> <dbl> <dbl>
## 1 1 2 3
## 2 4 5 6
# custom col names
read_csv(
"1,2,3
4,5,6",
col_names = c("x", "y", "z")
)
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): x, y, z
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 3
## x y z
## <dbl> <dbl> <dbl>
## 1 1 2 3
## 2 4 5 6
# semicolon: csv2, tab: tsv, delim: guess, fwf: fixed width, table: white space/fixed width, log: apache log
# delim
# na, trim_ws, etc.
# read_fwf() fwf_empty() - Guesses based on the positions of empty columns. fwf_widths() - Supply the widths of the columns. fwf_positions() - Supply paired vectors of start and end positions. fwf_cols() - Supply named arguments of paired start and end positions or column widths.
read_csv("x,y\n1,'a,b'", quote = "'")
## Rows: 1 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): y
## dbl (1): x
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 2
## x y
## <dbl> <chr>
## 1 1 a,b
read_csv("a,b,c\n1,2,3\n4,5,6") # was missing a column
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): a, b, c
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 3
## a b c
## <dbl> <dbl> <dbl>
## 1 1 2 3
## 2 4 5 6
annoying <- tibble(
`1` = 1:10,
`2` = `1` * 2 + rnorm(length(`1`))
)
annoying
## # A tibble: 10 × 2
## `1` `2`
## <int> <dbl>
## 1 1 2.05
## 2 2 4.02
## 3 3 6.20
## 4 4 8.19
## 5 5 9.84
## 6 6 13.0
## 7 7 13.8
## 8 8 16.9
## 9 9 18.1
## 10 10 18.5
# Grab just the column named "1", kept as a one-column tibble.
getOne <- select(annoying, `1`)
# Extracting the variable labeled as '1'
annoying |>
pull(`1`)
## [1] 1 2 3 4 5 6 7 8 9 10
# scatterplot one vs. two
annoying |>
ggplot(aes(x = `2`, y = `1`)) +
geom_point()
# Creating a new column called 3, which is 2 divided by 1.
annoying <- annoying |>
mutate(
`3` = `2` / `1`
)
# Renaming the columns to one, two, and three
annoying |>
rename("one" = `1`, "two" = `2`, "three" = `3`)
## # A tibble: 10 × 3
## one two three
## <int> <dbl> <dbl>
## 1 1 2.05 2.05
## 2 2 4.02 2.01
## 3 3 6.20 2.07
## 4 4 8.19 2.05
## 5 5 9.84 1.97
## 6 6 13.0 2.16
## 7 7 13.8 1.98
## 8 8 16.9 2.11
## 9 9 18.1 2.02
## 10 10 18.5 1.85
read_csv("
logical,numeric,date,string
TRUE,1,2021-01-15,abc
false,4.5,2021-02-15,def
T,Inf,2021-02-16,ghi
")
## Rows: 3 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): string
## dbl (1): numeric
## lgl (1): logical
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 3 × 4
## logical numeric date string
## <lgl> <dbl> <date> <chr>
## 1 TRUE 1 2021-01-15 abc
## 2 FALSE 4.5 2021-02-15 def
## 3 TRUE Inf 2021-02-16 ghi
another_csv <- "
x,y,z
1,2,3"
read_csv(
another_csv,
col_types = cols(.default = col_character())
)
## # A tibble: 1 × 3
## x y z
## <chr> <chr> <chr>
## 1 1 2 3
sales_files <- c(
"https://pos.it/r4ds-01-sales",
"https://pos.it/r4ds-02-sales",
"https://pos.it/r4ds-03-sales"
)
read_csv(sales_files, id = "file") # id argument adds a new column called file to the resulting data frame that identifies the file the data come from.
## Rows: 19 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): month
## dbl (4): year, brand, item, n
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 19 × 6
## file month year brand item n
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 https://pos.it/r4ds-01-sales January 2019 1 1234 3
## 2 https://pos.it/r4ds-01-sales January 2019 1 8721 9
## 3 https://pos.it/r4ds-01-sales January 2019 1 1822 2
## 4 https://pos.it/r4ds-01-sales January 2019 2 3333 1
## 5 https://pos.it/r4ds-01-sales January 2019 2 2156 9
## 6 https://pos.it/r4ds-01-sales January 2019 2 3987 6
## 7 https://pos.it/r4ds-01-sales January 2019 2 3827 6
## 8 https://pos.it/r4ds-02-sales February 2019 1 1234 8
## 9 https://pos.it/r4ds-02-sales February 2019 1 8721 2
## 10 https://pos.it/r4ds-02-sales February 2019 1 1822 3
## 11 https://pos.it/r4ds-02-sales February 2019 2 3333 1
## 12 https://pos.it/r4ds-02-sales February 2019 2 2156 3
## 13 https://pos.it/r4ds-02-sales February 2019 2 3987 6
## 14 https://pos.it/r4ds-03-sales March 2019 1 1234 3
## 15 https://pos.it/r4ds-03-sales March 2019 1 3627 1
## 16 https://pos.it/r4ds-03-sales March 2019 1 8820 3
## 17 https://pos.it/r4ds-03-sales March 2019 2 7253 1
## 18 https://pos.it/r4ds-03-sales March 2019 2 8766 3
## 19 https://pos.it/r4ds-03-sales March 2019 2 8288 6
sales_files <- list.files("data", pattern = "sales\\.csv$", full.names = TRUE)
sales_files
## [1] "data/01-sales.csv" "data/02-sales.csv" "data/03-sales.csv"
students
## # A tibble: 6 × 5
## student_id full_name favourite_food meal_plan age
## <dbl> <chr> <chr> <fct> <dbl>
## 1 1 Sunil Huffmann Strawberry yoghurt Lunch only 4
## 2 2 Barclay Lynn French fries Lunch only 5
## 3 3 Jayendra Lyne <NA> Breakfast and lunch 7
## 4 4 Leon Rossini Anchovies Lunch only NA
## 5 5 Chidiegwu Dunkel Pizza Breakfast and lunch 5
## 6 6 Güvenç Attila Ice cream Lunch only 6
write_csv(students, "data/students-2.csv")
read_csv("data/students-2.csv") # note that we lose col type for meal_plan
## Rows: 6 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): full_name, favourite_food, meal_plan
## dbl (2): student_id, age
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 6 × 5
## student_id full_name favourite_food meal_plan age
## <dbl> <chr> <chr> <chr> <dbl>
## 1 1 Sunil Huffmann Strawberry yoghurt Lunch only 4
## 2 2 Barclay Lynn French fries Lunch only 5
## 3 3 Jayendra Lyne <NA> Breakfast and lunch 7
## 4 4 Leon Rossini Anchovies Lunch only NA
## 5 5 Chidiegwu Dunkel Pizza Breakfast and lunch 5
## 6 6 Güvenç Attila Ice cream Lunch only 6
# custom R's binary RDS
write_rds(students, "data/students.rds")
read_rds("data/students.rds")
## # A tibble: 6 × 5
## student_id full_name favourite_food meal_plan age
## <dbl> <chr> <chr> <fct> <dbl>
## 1 1 Sunil Huffmann Strawberry yoghurt Lunch only 4
## 2 2 Barclay Lynn French fries Lunch only 5
## 3 3 Jayendra Lyne <NA> Breakfast and lunch 7
## 4 4 Leon Rossini Anchovies Lunch only NA
## 5 5 Chidiegwu Dunkel Pizza Breakfast and lunch 5
## 6 6 Güvenç Attila Ice cream Lunch only 6
# arrow binary for many languages
library(arrow)
## Warning: package 'arrow' was built under R version 4.2.3
## Some features are not enabled in this build of Arrow. Run `arrow_info()` for more information.
##
## Attaching package: 'arrow'
##
## The following object is masked from 'package:lubridate':
##
## duration
##
## The following object is masked from 'package:utils':
##
## timestamp
students
## # A tibble: 6 × 5
## student_id full_name favourite_food meal_plan age
## <dbl> <chr> <chr> <fct> <dbl>
## 1 1 Sunil Huffmann Strawberry yoghurt Lunch only 4
## 2 2 Barclay Lynn French fries Lunch only 5
## 3 3 Jayendra Lyne <NA> Breakfast and lunch 7
## 4 4 Leon Rossini Anchovies Lunch only NA
## 5 5 Chidiegwu Dunkel Pizza Breakfast and lunch 5
## 6 6 Güvenç Attila Ice cream Lunch only 6
# write_parquet(students, "students.parquet")
# read_parquet("students.parquet")
# by column which is a bit weird
tibble(
x = c(1, 2, 5),
y = c("h", "m", "g"),
z = c(0.08, 0.83, 0.60)
)
## # A tibble: 3 × 3
## x y z
## <dbl> <chr> <dbl>
## 1 1 h 0.08
## 2 2 m 0.83
## 3 5 g 0.6
# by row which is easier
tribble(
~x, ~y, ~z,
1, "h", 0.08,
2, "m", 0.83,
5, "g", 0.60
)
## # A tibble: 3 × 3
## x y z
## <dbl> <chr> <dbl>
## 1 1 h 0.08
## 2 2 m 0.83
## 3 5 g 0.6
Ch 8
y <- 1:4
mean(y)
## [1] 2.5
dput(mtcars)
## structure(list(mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3,
## 24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4,
## 30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8,
## 19.7, 15, 21.4), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8,
## 8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4),
## disp = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8,
## 167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7,
## 71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145,
## 301, 121), hp = c(110, 110, 93, 110, 175, 105, 245, 62, 95,
## 123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150,
## 150, 245, 175, 66, 91, 113, 264, 175, 335, 109), drat = c(3.9,
## 3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,
## 3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76,
## 3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
## ), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19,
## 3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2,
## 1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14,
## 1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61,
## 19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6,
## 18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87,
## 17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
## ), vs = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
## 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), am = c(1,
## 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
## 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3,
## 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3,
## 3, 3, 4, 5, 5, 5, 5, 5, 4), carb = c(4, 4, 1, 1, 2, 1, 4,
## 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1,
## 2, 2, 4, 6, 8, 2)), row.names = c("Mazda RX4", "Mazda RX4 Wag",
## "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", "Valiant",
## "Duster 360", "Merc 240D", "Merc 230", "Merc 280", "Merc 280C",
## "Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood",
## "Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic",
## "Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin",
## "Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2",
## "Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora",
## "Volvo 142E"), class = "data.frame")
Ch 9
mpg
## # A tibble: 234 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto… f 18 29 p comp…
## 2 audi a4 1.8 1999 4 manu… f 21 29 p comp…
## 3 audi a4 2 2008 4 manu… f 20 31 p comp…
## 4 audi a4 2 2008 4 auto… f 21 30 p comp…
## 5 audi a4 2.8 1999 6 auto… f 16 26 p comp…
## 6 audi a4 2.8 1999 6 manu… f 18 26 p comp…
## 7 audi a4 3.1 2008 6 auto… f 18 27 p comp…
## 8 audi a4 quattro 1.8 1999 4 manu… 4 18 26 p comp…
## 9 audi a4 quattro 1.8 1999 4 auto… 4 16 25 p comp…
## 10 audi a4 quattro 2 2008 4 manu… 4 20 28 p comp…
## # ℹ 224 more rows
mpg |>
ggplot(aes(x = displ, y = hwy, color = class)) +
geom_point()
mpg |>
ggplot(aes(x = displ, y = hwy, size = class)) +
geom_point()
## Warning: Using size for a discrete variable is not advised.
mpg |>
ggplot(aes(x = displ, y = hwy, alpha = class)) +
geom_point()
## Warning: Using alpha for a discrete variable is not advised.
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point(color = "blue")
# Create a scatterplot of hwy vs. displ where the points are pink filled in triangles.
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point(color = "pink", shape = 17)
# Why did the following code not result in a plot with blue points?
ggplot(mpg) +
geom_point(aes(x = displ, y = hwy, color = "blue")) # inside aes(), "blue" is treated as a data value and mapped through the default color scale; a literal color must be set outside aes()
ggplot(mpg) +
geom_point(aes(x = displ, y = hwy), color = "blue")
# what does the stroke aes do ?
ggplot(mpg) +
geom_point(aes(x = displ, y = hwy), stroke = 1) # adjust size, thickness
mpg |> ggplot(aes(x = displ, y = hwy, color = displ < 5)) +
geom_point() # true / false with diff colors
ggplot(mpg, aes(x = displ, y = hwy, shape = drv)) +
geom_point()
ggplot(mpg, aes(x = displ, y = hwy, linetype = drv)) +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
geom_point() +
geom_smooth(aes(linetype = drv))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Left
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Middle
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_smooth(aes(group = drv))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Right
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_smooth(aes(color = drv), show.legend = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point(aes(color = class)) +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point() +
geom_point(
data = mpg |> filter(class == "2seater"),
color = "red"
) +
geom_point(
data = mpg |> filter(class == "2seater"),
shape = "circle open", size = 3, color = "red"
)
# Left
ggplot(mpg, aes(x = hwy)) +
geom_histogram(binwidth = 2)
# Middle
ggplot(mpg, aes(x = hwy)) +
geom_density()
# Right
ggplot(mpg, aes(x = hwy)) +
geom_boxplot()
library(ggridges)
## Warning: package 'ggridges' was built under R version 4.2.3
ggplot(mpg, aes(x = hwy, y = drv, fill = drv, color = drv)) +
geom_density_ridges(alpha = 0.5, show.legend = FALSE)
## Picking joint bandwidth of 1.28
# What geom would you use to draw a line chart? A boxplot? A histogram? An area chart?
# line chart - geom_line
# boxplot - geom_boxplot
# histogram - geom_histogram
# area chart - geom_area
# Earlier in this chapter we used show.legend without explaining it:
# Hide the legend to reclaim plotting space.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_smooth(aes(color = drv), show.legend = FALSE) # spell out FALSE; F can be reassigned
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Same plot without the confidence ribbon (se = FALSE).
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_smooth(aes(color = drv), show.legend = FALSE, se = FALSE) # spell out TRUE/FALSE, not T/F
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# se = Display confidence interval around smooth? (TRUE by default, see level to control.)
# hides legend, more space
# Recreate the R code necessary to generate the following graphs. Note that wherever a categorical variable is used in the plot, it’s drv
# One overall smooth across all points.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(stroke = 2) +
  geom_smooth(se = FALSE) # spell out FALSE, not F
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# One smooth per drive train, without a legend (group = drv maps no
# visual aesthetic). Removed the stray trailing comma inside aes().
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(stroke = 2) +
  geom_smooth(se = FALSE, aes(group = drv))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Points and smooths colored by drive train. color = drv is inherited
# from the global aes(), so it need not be repeated in geom_smooth().
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point(stroke = 2) +
  geom_smooth(se = FALSE, show.legend = TRUE) # spell out TRUE/FALSE, not T/F
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Color only the points by drv; distinguish the smooths by linetype.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(stroke = 2, aes(color = drv)) +
  geom_smooth(se = FALSE, show.legend = TRUE, aes(linetype = drv)) # spell out TRUE/FALSE, not T/F
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# White "halo" effect: draw big white points first, colored points on top.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(shape = 16, size = 7, color = "white") +
  geom_point(aes(color = drv), stroke = 2) # stroke is a fixed setting, so it belongs outside aes(); mapping it to the constant 2 would add a spurious legend
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point() +
facet_wrap(~cyl)
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point() +
facet_grid(drv ~ cyl)
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point() +
facet_grid(drv ~ cyl, scales = "free_y") # different scales in columns, helps visualize better
# What happens if you facet on a continuous variable?
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point() +
facet_wrap(~cty) # with a continuous variable, you get one facet per unique value — 21 panels here, see count below
mpg |> count(cty)
## # A tibble: 21 × 2
## cty n
## <int> <int>
## 1 9 5
## 2 11 20
## 3 12 8
## 4 13 21
## 5 14 19
## 6 15 24
## 7 16 19
## 8 17 16
## 9 18 26
## 10 19 20
## # ℹ 11 more rows
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point() +
facet_grid(drv ~ cyl)
# What do the empty cells in the plot above with facet_grid(drv ~ cyl) mean? Run the following code. How do they relate to the resulting plot?
ggplot(mpg) +
geom_point(aes(x = drv, y = cyl))
# certain combinations do not exist; for instance, there are no rear-wheel-drive cars with 4 cylinders
mpg %>%
filter(drv == "r" & cyl == 4)
## # A tibble: 0 × 11
## # ℹ 11 variables: manufacturer <chr>, model <chr>, displ <dbl>, year <int>,
## # cyl <int>, trans <chr>, drv <chr>, cty <int>, hwy <int>, fl <chr>,
## # class <chr>
# What plots does the following code make? What does . do?
ggplot(mpg) +
geom_point(aes(x = displ, y = hwy)) +
facet_grid(drv ~ .)
ggplot(mpg) +
geom_point(aes(x = displ, y = hwy)) +
facet_grid(. ~ cyl)
# Take the first faceted plot in this section:
ggplot(mpg) +
geom_point(aes(x = displ, y = hwy)) +
facet_wrap(~class, nrow = 2)
# What are the advantages to using faceting instead of the color aesthetic? What are the disadvantages? How might the balance change if you had a larger dataset?
# facets allow for different scales (free y) and allow for more breathing space. Larger data sets might mean too many panels though.
# Read ?facet_wrap. What does nrow do? What does ncol do? What other options control the layout of the individual panels? Why doesn’t facet_grid() have nrow and ncol arguments?
?facet_wrap
# nrow, ncol Number of rows and columns.
ggplot(mpg) +
geom_point(aes(x = displ, y = hwy)) +
facet_wrap(~class, nrow = 4) # can make more vertical orientation, for instance, grid won't have these options as they are already set
# Which of the following plots makes it easier to compare engine size (displ) across cars with different drive trains? What does this say about when to place a faceting variable across rows or columns?
ggplot(mpg, aes(x = displ)) +
geom_histogram() +
facet_grid(drv ~ .) # by rows (much better)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(mpg, aes(x = displ)) +
geom_histogram() +
facet_grid(. ~ drv) # columns compete with x=displ
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Recreate the following plot using facet_wrap() instead of facet_grid(). How do the positions of the facet labels change?
ggplot(mpg) +
geom_point(aes(x = displ, y = hwy)) +
facet_grid(drv ~ .)
ggplot(mpg) +
geom_point(aes(x = displ, y = hwy)) +
facet_grid(drv ~ ., switch = "y") # move label to otherside
ggplot(diamonds, aes(x = cut)) +
geom_bar()
?geom_bar
diamonds |>
count(cut) |>
ggplot(aes(x = cut, y = n)) +
geom_bar(stat = "identity")
ggplot(diamonds, aes(x = cut, y = after_stat(prop), group = 1)) +
geom_bar()
ggplot(diamonds) +
stat_summary(
aes(x = cut, y = depth),
fun.min = min,
fun.max = max,
fun = median
)
# What is the default geom associated with stat_summary()? How could you rewrite the previous plot to use that geom function instead of the stat function?
# Uses "geom = pointrange" by default
diamonds |>
group_by(cut) |>
summarize(
lower = min(depth),
upper = max(depth),
midpoint = median(depth)
) |>
ggplot(aes(x = cut, y = midpoint)) +
geom_pointrange(aes(ymin = lower, ymax = upper))
# What does geom_col() do? How is it different from geom_bar()?
# geom_col() plots values already present in the data, while geom_bar() counts rows
ggplot(diamonds, aes(x = cut)) +
geom_bar()
ggplot(diamonds, aes(x = cut, y = depth)) +
geom_col()
# Most geoms and stats come in pairs that are almost always used in concert. Make a list of all the pairs. What do they have in common? (Hint: Read through the documentation.)
# geom stat
# geom_bar() stat_count()
# geom_bin2d() stat_bin_2d()
# geom_boxplot() stat_boxplot()
# geom_contour_filled() stat_contour_filled()
# geom_contour() stat_contour()
# geom_count() stat_sum()
# geom_density_2d() stat_density_2d()
# geom_density() stat_density()
# geom_dotplot() stat_bindot()
# geom_function() stat_function()
# geom_sf() stat_sf()
# geom_sf() stat_sf()
# geom_smooth() stat_smooth()
# geom_violin() stat_ydensity()
# geom_hex() stat_bin_hex()
# geom_qq_line() stat_qq_line()
# geom_qq() stat_qq()
# geom_quantile() stat_quantile()
# What variables does stat_smooth() compute? What arguments control its behavior?
?stat_smooth
# predicted value, lower CI from mean, upper CI from mean, and SE
# In our proportion bar chart, we needed to set group = 1. Why? In other words, what is the problem with these two graphs?
# In the first pair of plots, we see that setting group = 1 results in the marginal proportions of cuts being plotted. In the second pair of plots, setting group = color results in the proportions of colors within each cut being plotted.
# one variable
ggplot(diamonds, aes(x = cut, y = after_stat(prop))) +
geom_bar()
ggplot(diamonds, aes(x = cut, y = after_stat(prop), group = 1)) +
geom_bar() # after_stat(prop) represents the proportion of each category. The group = 1 argument is used to ensure the proportion are grouped
# two variables
ggplot(diamonds, aes(x = cut, fill = color, y = after_stat(prop))) +
geom_bar()
ggplot(diamonds, aes(x = cut, fill = color, y = after_stat(prop), group = color)) +
geom_bar()
# Color
ggplot(mpg, aes(x = drv, color = drv)) +
geom_bar()
# Fill
ggplot(mpg, aes(x = drv, fill = drv)) +
geom_bar()
# fill with another class other than x
ggplot(mpg, aes(x = drv, fill = class)) +
geom_bar()
# identity
ggplot(mpg, aes(x = drv, fill = class)) +
geom_bar(alpha = 1 / 5, position = "identity") # create overlap, so use alpha for transparency, The identity position adjustment is more useful for 2d geoms, like points, where it is the default.
# transparent
ggplot(mpg, aes(x = drv, color = class)) +
geom_bar(fill = NA, position = "identity") # completely transparent by setting fill = NA.
# fill
ggplot(mpg, aes(x = drv, fill = class)) +
geom_bar(position = "fill") # "fill" works like stacking, but makes each set of stacked bars the same height. This makes it easier to compare proportions across groups.
# dodge
ggplot(mpg, aes(x = drv, fill = class)) +
geom_bar(position = "dodge") # "dodge" places overlapping objects directly beside one another. This makes it easier to compare individual values.
# fixing overplotting (many observations plotted at the same position, hiding each other)
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point(position = "jitter") # adds a small amount of random noise to each point. This spreads the points out because no two points are likely to receive the same amount of random noise.
# shorthand for jitter
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_jitter()
# What is the problem with the following plot? How could you improve it?
ggplot(mpg, aes(x = cty, y = hwy)) +
geom_point()
# jitter it to avoid overplotting
ggplot(mpg, aes(x = cty, y = hwy)) +
geom_jitter()
# What, if anything, is the difference between the two plots? Why?
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point()
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point(position = "identity")
# no difference; "identity" is the default position for geom_point()
# What parameters to geom_jitter() control the amount of jittering?
?geom_jitter # width, height, defaults to .4. this means the jitter values will occupy 80% (twice value of .4) of the implied bins. Categorical data is aligned on the integers, so a width or height of 0.5 will spread the data so it's not possible to see the distinction between the categories.
ggplot(mpg, aes(x = cty, y = hwy)) +
geom_jitter(width = .8, height = .8)
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_jitter()
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_count() # size by overlap
# What’s the default position adjustment for geom_boxplot()? Create a visualization of the mpg dataset that demonstrates it.
?geom_boxplot # position = "dodge2"
ggplot(mpg, aes(x = cty, y = displ)) +
geom_boxplot()
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
ggplot(mpg, aes(x = cty, y = displ)) +
geom_boxplot(position = "dodge2")
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
nz <- map_data("nz")
ggplot(nz, aes(x = long, y = lat, group = group)) +
geom_polygon(fill = "white", color = "black")
ggplot(nz, aes(x = long, y = lat, group = group)) +
geom_polygon(fill = "white", color = "black") +
coord_quickmap() # sets the aspect ratio correctly for geographic maps.
bar <- ggplot(data = diamonds) +
geom_bar(
mapping = aes(x = clarity, fill = clarity),
show.legend = FALSE,
width = 1
) +
theme(aspect.ratio = 1)
bar
bar + coord_flip() # flipped
bar + coord_polar() # bar/Coxcomb
# Turn a stacked bar chart into a pie chart using coord_polar().
mpg |> ggplot(aes(x = "", fill = class)) + # stacked bar on a single dummy x category
  geom_bar() +
  coord_polar(theta = "y") # pie chart; theta = variable to map to angle (x or y), default = x
# What’s the difference between coord_quickmap() and coord_map()?
?coord_quickmap # quickmap: approximate projection, good for countries near equator, map: more computation, as no straight lines
# What does the following plot tell you about the relationship between city and highway mpg? Why is coord_fixed() important? What does geom_abline() do?
ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
geom_point() +
geom_abline() + # diagonal reference line where highway = city mileage, shows hwy mileage always higher than city
coord_fixed() # 1 unit on y axis is same length as 1 unit on x axis,
# ggplot(data = <DATA>) +
# <GEOM_FUNCTION>(
# mapping = aes(<MAPPINGS>),
# stat = <STAT>,
# position = <POSITION>
# ) +
# <COORDINATE_FUNCTION> +
# <FACET_FUNCTION>
Ch. 10: Exploratory data analysis
ggplot(diamonds, aes(x = carat)) +
geom_histogram(binwidth = 0.5)
smaller <- diamonds |>
filter(carat < 3)
ggplot(smaller, aes(x = carat)) +
geom_histogram(binwidth = 0.01)
glimpse(smaller)
## Rows: 53,900
## Columns: 10
## $ carat <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
## $ color <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
## $ depth <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…
ggplot(diamonds, aes(x = y)) +
geom_histogram(binwidth = 0.5)
ggplot(diamonds, aes(x = y)) +
geom_histogram(binwidth = 0.5) +
coord_cartesian(ylim = c(0, 50)) # zoom to small values of the y-axis with coord_cartesian(): coord_cartesian() also has an xlim() argument for when you need to zoom into the x-axis. ggplot2 also has xlim() and ylim() functions that work slightly differently: they throw away the data outside the limits.
unusual <- diamonds |>
filter(y < 3 | y > 20) |>
select(price, x, y, z) |>
arrange(y)
unusual
## # A tibble: 9 × 4
## price x y z
## <int> <dbl> <dbl> <dbl>
## 1 5139 0 0 0
## 2 6381 0 0 0
## 3 12800 0 0 0
## 4 15686 0 0 0
## 5 18034 0 0 0
## 6 2130 0 0 0
## 7 2130 0 0 0
## 8 2075 5.15 31.8 5.12
## 9 12210 8.09 58.9 8.06
# Explore the distribution of each of the x, y, and z variables in diamonds. What do you learn? Think about a diamond and how you might decide which dimension is the length, width, and depth.
# Distribution of x (length). The original call misspelled the argument as
# `binwidxth`; ggplot2 silently ignored it (unknown-parameter warning) and
# fell back to the default bins = 30. Spelled correctly here.
ggplot(diamonds, aes(x = x)) +
  geom_histogram(binwidth = 0.5)
ggplot(diamonds, aes(x = y)) +
geom_histogram(binwidth = 0.5)
ggplot(diamonds, aes(x = z)) +
geom_histogram(binwidth = 0.5)
summary(diamonds$x) # 10.7
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 4.710 5.700 5.731 6.540 10.740
summary(diamonds$y) # 58.9
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 4.720 5.710 5.735 6.540 58.900
summary(diamonds$z) # 31
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.910 3.530 3.539 4.040 31.800
# Appears to relate to vertical orientation
# Explore the distribution of price. Do you discover anything unusual or surprising? (Hint: Carefully think about the binwidth and make sure you try a wide range of values.)
ggplot(diamonds, aes(x = price, fill = cut)) +
geom_histogram(binwidth = 3)
# How many diamonds are 0.99 carat? How many are 1 carat? What do you think is the cause of the difference?
diamonds |>
  filter(carat == 0.99) |>
  count() # 23
## # A tibble: 1 × 1
## n
## <int>
## 1 23
diamonds |>
  filter(carat == 1) |>
  count() # 1558 — far more 1.00-carat stones: cutters round up to the "magic" 1 carat
## # A tibble: 1 × 1
## n
## <int>
## 1 1558
# Compare and contrast coord_cartesian() vs. xlim() or ylim() when zooming in on a histogram. What happens if you leave binwidth unset? What happens if you try and zoom so only half a bar shows?
# Baseline (unzoomed) histogram for the coord_cartesian() vs. xlim()/ylim()
# comparison below. The original call had a stray trailing comma in
# geom_histogram(binwidth = 3, ); removed here.
ggplot(diamonds, aes(x = price, fill = cut)) +
  geom_histogram(binwidth = 3)
ggplot(diamonds, aes(x = price, fill = cut)) +
geom_histogram(binwidth = 3) +
xlim(0, 10000) +
ylim(0, 100) # no plotting beyond limits
## Warning: Removed 5222 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 153 rows containing missing values or values outside the scale range
## (`geom_bar()`).
ggplot(diamonds, aes(x = price, fill = cut)) +
geom_histogram(binwidth = 3) +
coord_cartesian(xlim = c(0, 10000), ylim = c(0, 100)) # still plots data beyond limits
# Option 1: drop the rows with implausible values — not recommended (silently loses data)
diamonds2 <- diamonds |>
  filter(between(y, 3, 20))
# Option 2 (better): replace the implausible values with NA so they are visibly missing
diamonds2 <- diamonds |>
  mutate(y = if_else(y < 3 | y > 20, NA, y))
# ggplot will warn that rows containing missing values have been removed
ggplot(diamonds2, aes(x = x, y = y)) +
geom_point()
## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_point()`).
# to suppress the warning
ggplot(diamonds2, aes(x = x, y = y)) +
geom_point(na.rm = TRUE)
nycflights13::flights |>
mutate(
cancelled = is.na(dep_time), # NA = flight canceled, so plot!
sched_hour = sched_dep_time %/% 100,
sched_min = sched_dep_time %% 100,
sched_dep_time = sched_hour + (sched_min / 60)
) |>
ggplot(aes(x = sched_dep_time)) +
geom_freqpoly(aes(color = cancelled), binwidth = 1 / 4)
# However this plot isn’t great because there are many more non-cancelled flights than cancelled flights.
# What happens to missing values in a histogram? What happens to missing values in a bar chart? Why is there a difference in how missing values are handled in histograms and bar charts?
diamonds2 |> ggplot(aes(x = y)) +
geom_boxplot() # missing values ignored
## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
diamonds2 |> ggplot(aes(x = y)) +
geom_bar() # appears to allow ignore?
## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_count()`).
# What does na.rm = TRUE do in mean() and sum()?
diamonds2 |> summarise(
ymean = mean(y) # not remove, NA will result
)
## # A tibble: 1 × 1
## ymean
## <dbl>
## 1 NA
diamonds2 |> summarise(
  ymean = mean(y, na.rm = TRUE) # drop NAs before averaging; use TRUE, never the reassignable shorthand T
)
## # A tibble: 1 × 1
## ymean
## <dbl>
## 1 5.73
diamonds2 |> summarise(
ymean = sum(y) # not removed, NA will result
)
## # A tibble: 1 × 1
## ymean
## <dbl>
## 1 NA
diamonds2 |> summarise(
  ymean = sum(y, na.rm = TRUE) # drop NAs before summing; use TRUE, never the reassignable shorthand T
)
## # A tibble: 1 × 1
## ymean
## <dbl>
## 1 309230.
# Recreate the frequency plot of scheduled_dep_time colored by whether the flight was cancelled or not. Also facet by the cancelled variable. Experiment with different values of the scales variable in the faceting function to mitigate the effect of more non-cancelled flights than cancelled flights.
nycflights13::flights |>
mutate(
cancelled = is.na(dep_time)
) |>
ggplot(aes(x = sched_dep_time)) +
geom_freqpoly(aes(color = cancelled), binwidth = 1 / 4) +
facet_wrap(~cancelled, scales = "free_y") # zoom in on y range
ggplot(diamonds, aes(x = price)) +
geom_freqpoly(aes(color = cut), binwidth = 500, linewidth = 0.75) # default appearance of geom_freqpoly() is not that useful here because the height, determined by the overall count, differs so much across cuts, making it hard to see the differences in the shapes of their distributions.
# we’ll display the density, which is the count standardized so that the area under each frequency polygon is one. Note that we’re mapping the density to y, but since density is not a variable in the diamonds dataset, we need to first calculate it. We use the after_stat() function to do so.
ggplot(diamonds, aes(x = price, y = after_stat(density))) +
geom_freqpoly(aes(color = cut), binwidth = 500, linewidth = 0.75)
# visually simpler plot for exploring this relationship is using side-by-side boxplots.
ggplot(diamonds, aes(x = cut, y = price)) +
geom_boxplot()
ggplot(mpg, aes(x = class, y = hwy)) +
geom_boxplot() # quite scattered
ggplot(mpg, aes(x = fct_reorder(class, hwy, median), y = hwy)) +
geom_boxplot() # reorder class based on the median value of hwy:
ggplot(mpg, aes(x = fct_reorder(class, hwy, median), y = hwy)) +
geom_boxplot() +
coord_flip() # to help with long names
# Use what you’ve learned to improve the visualization of the departure times of cancelled vs. non-cancelled flights.
nycflights13::flights |>
mutate(
cancelled = is.na(dep_time)
) |>
ggplot(aes(x = sched_dep_time, y = after_stat(density))) +
geom_freqpoly(aes(color = cancelled), binwidth = 3)
# Based on EDA, what variable in the diamonds dataset appears to be most important for predicting the price of a diamond? How is that variable correlated with cut? Why does the combination of those two relationships lead to lower quality diamonds being more expensive?
# TO DO
# Instead of exchanging the x and y variables, add coord_flip() as a new layer to the vertical boxplot to create a horizontal one. How does this compare to exchanging the variables?
# no difference apparently
ggplot(mpg, aes(x = fct_reorder(class, hwy, median), y = hwy)) +
geom_boxplot() +
coord_flip()
ggplot(mpg, aes(y = fct_reorder(class, hwy, median), x = hwy)) +
geom_boxplot()
# One problem with boxplots is that they were developed in an era of much smaller datasets and tend to display a prohibitively large number of “outlying values”. One approach to remedy this problem is the letter value plot. Install the lvplot package, and try using geom_lv() to display the distribution of price vs. cut. What do you learn? How do you interpret the plots?
diamonds |> ggplot(aes(x = cut, y = price)) +
geom_lv()
# useful for larger datasets, many more quantiles.
# Create a visualization of diamond prices vs. a categorical variable from the diamonds dataset using geom_violin(), then a faceted geom_histogram(), then a colored geom_freqpoly(), and then a colored geom_density(). Compare and contrast the four plots. What are the pros and cons of each method of visualizing the distribution of a numerical variable based on the levels of a categorical variable?
# no overlaps but look the same after 5000 count,
diamonds |> ggplot(aes(x = color, y = price)) +
geom_violin()
# no overlaps but quite small
# Faceted histogram of price by color; panels don't overlap but each is small.
diamonds %>%
  ggplot(aes(x = price)) +
  geom_histogram(show.legend = FALSE) + # spell out FALSE; F is a reassignable variable, not a keyword
  facet_wrap(~color)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# not too bad, gets hard to read when counts are similar
diamonds |> ggplot(aes(x = price, color = color)) +
geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# overlaps but easier to see differences
diamonds |> ggplot(aes(x = price, fill = color)) +
geom_density()
# If you have a small dataset, it’s sometimes useful to use geom_jitter() to avoid overplotting to more easily see the relationship between a continuous and categorical variable. The ggbeeswarm package provides a number of methods similar to geom_jitter(). List them and briefly describe what each one does.
# visit https://github.com/eclarke/ggbeeswarm
# geom_beeswarm - arranges points in a "beeswarm": overlapping points are offset sideways so none overlap
# geom_quasirandom - works similarly to geom_jitter but reduces overplotting using a van der Corput sequence or Tukey texturing
# count the number of observations for each combination of levels of these categorical variables. One way to do that is to rely on the built-in geom_count():
ggplot(diamonds, aes(x = cut, y = color)) +
geom_count()
# alternative
diamonds |>
count(color, cut) |>
arrange(desc(n))
## # A tibble: 35 × 3
## color cut n
## <ord> <ord> <int>
## 1 G Ideal 4884
## 2 E Ideal 3903
## 3 F Ideal 3826
## 4 H Ideal 3115
## 5 G Premium 2924
## 6 D Ideal 2834
## 7 E Very Good 2400
## 8 H Premium 2360
## 9 E Premium 2337
## 10 F Premium 2331
## # ℹ 25 more rows
diamonds |>
count(color, cut) |>
ggplot(aes(x = color, y = cut)) +
geom_tile(aes(fill = n))
# 10.5.2.1 Exercises
# How could you rescale the count dataset above to more clearly show the distribution of cut within color, or color within cut?
diamonds |>
count(color, cut) |>
group_by(color) |>
mutate(percent_cut = n / sum(n))
## # A tibble: 35 × 4
## # Groups: color [7]
## color cut n percent_cut
## <ord> <ord> <int> <dbl>
## 1 D Fair 163 0.0241
## 2 D Good 662 0.0977
## 3 D Very Good 1513 0.223
## 4 D Premium 1603 0.237
## 5 D Ideal 2834 0.418
## 6 E Fair 224 0.0229
## 7 E Good 933 0.0952
## 8 E Very Good 2400 0.245
## 9 E Premium 2337 0.239
## 10 E Ideal 3903 0.398
## # ℹ 25 more rows
diamonds |>
count(color, cut) |>
group_by(cut) |> #
mutate(percent_color = n / sum(n))
## # A tibble: 35 × 4
## # Groups: cut [5]
## color cut n percent_color
## <ord> <ord> <int> <dbl>
## 1 D Fair 163 0.101
## 2 D Good 662 0.135
## 3 D Very Good 1513 0.125
## 4 D Premium 1603 0.116
## 5 D Ideal 2834 0.132
## 6 E Fair 224 0.139
## 7 E Good 933 0.190
## 8 E Very Good 2400 0.199
## 9 E Premium 2337 0.169
## 10 E Ideal 3903 0.181
## # ℹ 25 more rows
# What different data insights do you get with a segmented bar chart if color is mapped to the x aesthetic and cut is mapped to the fill aesthetic? Calculate the counts that fall into each of the segments.
diamonds |> ggplot(aes(x = color, fill = cut)) +
geom_bar()
diamonds_counts <- diamonds %>%
count(color, cut)
ggplot(diamonds_counts, aes(x = color, y = n, fill = cut)) +
geom_bar(stat = "identity") +
geom_text(aes(label = n), position = position_stack(vjust = 0.2))
# Use geom_tile() together with dplyr to explore how average flight departure delays vary by destination and month of year. What makes the plot difficult to read? How could you improve it?
# Heatmap of mean departure delay by destination and month.
# Note: the original piped through group_by(dest) immediately followed by
# ungroup(), a no-op pair; the redundant group_by() is removed here.
nycflights13::flights %>%
  group_by(month, dest) %>%
  summarise(dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ungroup() %>% # summarise() leaves the result grouped by month; drop that before reordering
  mutate(dest = reorder(dest, dep_delay)) %>% # sort destinations by mean delay for a readable y-axis
  ggplot(aes(
    x = factor(month.name[month], levels = month.name),
    y = dest,
    fill = dep_delay
  )) +
  geom_tile() +
  labs(x = "Month", y = "Destination", fill = "Departure Delay") +
  scale_fill_gradient(low = "white", high = "red") +
  theme(
    axis.text.y = element_text(size = 4), # many destinations: shrink labels so they fit
    axis.text.x = element_text(size = 7),
    legend.position = "bottom"
  )
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
ggplot(smaller, aes(x = carat, y = price)) +
geom_point()
ggplot(smaller, aes(x = carat, y = price)) +
geom_point(alpha = 1 / 100) # using the alpha aesthetic to add transparency.
ggplot(smaller, aes(x = carat, y = price)) +
geom_bin2d()
ggplot(smaller, aes(x = carat, y = price)) +
geom_hex()
ggplot(smaller, aes(x = carat, y = price)) +
geom_boxplot(aes(group = cut_width(carat, 0.1)))
# Instead of summarizing the conditional distribution with a boxplot, you could use a frequency polygon. What do you need to consider when using cut_width() vs. cut_number()? How does that impact a visualization of the 2d distribution of carat and price?
# Visualize the distribution of carat, partitioned by price.
# How does the price distribution of very large diamonds compare to small diamonds? Is it as you expect, or does it surprise you?
# Combine two of the techniques you’ve learned to visualize the combined distribution of cut, carat, and price.
# Two dimensional plots reveal outliers that are not visible in one dimensional plots. For example, some points in the following plot have an unusual combination of x and y values, which makes the points outliers even though their x and y values appear normal when examined separately. Why is a scatterplot a better display than a binned plot for this case?
diamonds |>
filter(x >= 4) |>
ggplot(aes(x = x, y = y)) +
geom_point() +
coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))
# Instead of creating boxes of equal width with cut_width(), we could create boxes that contain roughly equal number of points with cut_number(). What are the advantages and disadvantages of this approach?
ggplot(smaller, aes(x = carat, y = price)) +
geom_boxplot(aes(group = cut_number(carat, 20)))
diamonds <- diamonds |>
mutate(
log_price = log(price),
log_carat = log(carat)
)
diamonds_fit <- linear_reg() |>
fit(log_price ~ log_carat, data = diamonds)
diamonds_aug <- augment(diamonds_fit, new_data = diamonds) |>
mutate(.resid = exp(.resid))
ggplot(diamonds_aug, aes(x = carat, y = .resid)) +
geom_point()
ggplot(diamonds_aug, aes(x = cut, y = .resid)) +
geom_boxplot()
Chapter 11
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point(aes(color = class)) +
geom_smooth(se = FALSE) +
labs(
x = "Engine displacement (L)",
y = "Highway fuel economy (mpg)",
color = "Car type",
title = "Fuel efficiency generally decreases with engine size",
subtitle = "Two seaters (sports cars) are an exception because of their light weight",
caption = "Data from fueleconomy.gov"
)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
df <- tibble(
x = 1:10,
y = cumsum(x^2)
)
ggplot(df, aes(x, y)) +
geom_point() +
labs(
x = quote(x[i]),
y = quote(sum(x[i]^2, i == 1, n))
)
# Create one plot on the fuel economy data with customized title, subtitle, caption, x, y, and color labels.
names(mpg)
## [1] "manufacturer" "model" "displ" "year" "cyl"
## [6] "trans" "drv" "cty" "hwy" "fl"
## [11] "class"
?mpg
# Custom title, subtitle, caption, axis, and color labels on the fuel economy data.
# Fixes truncated/misspelled user-facing labels from the original ("Fue", "Disel").
mpg |> ggplot(aes(x = displ, y = cty, color = fl)) +
  geom_point() +
  labs(
    x = "Engine displacement (L)",
    y = "City fuel economy (mpg)",
    color = "Fuel type",
    title = "Diesel tends",
    subtitle = "Two seaters (sports cars) are an exception because of their light weight",
    caption = "Data from fueleconomy.gov"
  )
Label exercises
mpg |> ggplot(aes(cty, hwy, color = factor(class))) +
geom_point() +
geom_smooth(method = "lm", color = "blue", se = F) + # color to prevent multiple lines per class
labs(
x = "MPG in the city",
y = "MPG on the highway",
color = "Car class",
title = "Highway vs. city MPG",
subtitle = "SUVs are terrible!"
)
## `geom_smooth()` using formula = 'y ~ x'
Recreate the following plot using the fuel economy data. Note that both the colors and shapes of points vary by type of drive train.
mpg |> ggplot(aes(cty, hwy)) +
geom_point(aes(color = factor(drv), shape = factor(drv))) +
labs(
x = "MPG in the city",
y = "MPG on the highway",
color = "Type of drive train",
shape = "Type of drive train", # identical to avoid two legends
title = "Highway vs. city MPG",
subtitle = "SUVs are terrible!"
)
11.3 Annotations!
label_info <- mpg |>
group_by(drv) |>
arrange(desc(displ)) |>
slice_head(n = 1) |> # pull out the cars with the highest engine size in each drive type
mutate(
drive_type = case_when(
drv == "f" ~ "front-wheel drive",
drv == "r" ~ "rear-wheel drive",
drv == "4" ~ "4-wheel drive"
)
) |>
select(displ, hwy, drv, drive_type)
label_info
## # A tibble: 3 × 4
## # Groups: drv [3]
## displ hwy drv drive_type
## <dbl> <int> <chr> <chr>
## 1 6.5 17 4 4-wheel drive
## 2 5.3 25 f front-wheel drive
## 3 7 24 r rear-wheel drive
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
geom_point(alpha = 0.3) +
geom_smooth(se = FALSE) +
geom_text(
data = label_info,
aes(x = displ, y = hwy, label = drive_type),
fontface = "bold", size = 5, hjust = "right", vjust = "bottom"
) +
theme(legend.position = "none")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
#> `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
geom_point(alpha = 0.3) +
geom_smooth(se = FALSE) +
geom_text(
data = label_info,
aes(x = displ, y = hwy, label = drive_type),
fontface = "bold", size = 5, hjust = "right", vjust = "bottom"
) + # hjust (horizontal justification) and vjust (vertical justification) to control the alignment of the label.
theme(legend.position = "none")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
geom_point(alpha = 0.3) +
geom_smooth(se = FALSE) +
geom_label_repel( # We can use the geom_label_repel() function from the ggrepel package to address overlap
data = label_info,
aes(x = displ, y = hwy, label = drive_type),
fontface = "bold", size = 5, nudge_y = 2 # Using the fontface and size arguments we can customize the look of the text labels
) +
theme(legend.position = "none") # (theme(legend.position = "none") turns all the legends off
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
potential_outliers <- mpg |>
filter(hwy > 40 | (hwy > 20 & displ > 5))
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point() +
geom_text_repel(data = potential_outliers, aes(label = model)) +
geom_point(data = potential_outliers, color = "red") + # make outlier points red
geom_point(
data = potential_outliers,
color = "red", size = 3, shape = "circle open"
) # circle around circle
trend_text <- "Larger engine sizes tend to have lower fuel economy." |>
str_wrap(width = 30)
trend_text
## [1] "Larger engine sizes tend to\nhave lower fuel economy."
# annotation!
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point() +
annotate(
geom = "label", x = 3.5, y = 38,
label = trend_text,
hjust = "left", color = "red"
) +
annotate(
geom = "segment",
x = 3, y = 35, xend = 5, yend = 25, color = "red",
arrow = arrow(type = "closed")
)
11.3.1 Exercises with Annotations Use geom_text() with infinite positions to place text at the four corners of the plot.
corner_labels <- tibble(
x = c(-Inf, -Inf, Inf, Inf),
y = c(-Inf, Inf, -Inf, Inf),
label = c(
"(x0,y0)", "(x0,y1)",
"(x1,y0)", "(x1,y1)"
),
hjust = c(0, 0, 1, 1),
vjust = c(0, 1, 0, 1)
)
annoation_plot <- mpg |> ggplot(aes(x = displ, y = hwy)) +
geom_point() +
geom_text(data = corner_labels, aes(
x = x, y = y, hjust = hjust, vjust = vjust, label = label
))
annoation_plot
Use annotate() to add a point geom in the middle of your last plot
without having to create a tibble. Customize the shape, size, or color
of the point.
annoation_plot + annotate(
geom = "point", x = 4.5, y = 30,
color = "pink", size = 15, shape = 15
)
How do labels with geom_text() interact with faceting? How can you add a label to a single facet? How can you put a different label in each facet? (Hint: Think about the dataset that is being passed to geom_text().)
label_for_all <- tibble(
x = Inf,
y = Inf,
label = "text for all",
vjust = "inward",
hjust = "inward",
angle = 0,
)
mpg |> ggplot(aes(displ, hwy)) +
geom_point() +
facet_wrap(~class) +
geom_text(data = label_for_all, aes(label = label, x = x, y = y, vjust = vjust, hjust = hjust, angle = angle))
Label for a single facet
# Same one-row label tibble pattern, but with a `class` column: when the
# plot is facetted by class, geom_text() matches on that column and the
# label appears only in the "minivan" facet.
label_for_minivan <- tibble(
x = Inf,
y = Inf,
label = "text for minivan",
vjust = "inward",
hjust = "inward",
angle = 0,
class = "minivan"
)
# The label data's `class` column matches the facetting variable, so the
# text lands only in the minivan panel.
ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  facet_wrap(~class) +
  geom_text(
    data = label_for_minivan,
    aes(x = x, y = y, label = label, vjust = vjust, hjust = hjust, angle = angle)
  )
Labels for different facets
# different facets
# Build one label row per class (summarise collapses to one row per
# group), then attach the plotting columns; because the result carries a
# `class` column, each facet gets its own text.
label_different <- mpg |>
group_by(class) |>
summarise(mean_mpg = round(mean(hwy), 1)) |>
mutate(
x = Inf,
y = Inf,
label = paste(class, "average highway mpg:", mean_mpg),
vjust = "inward",
hjust = "inward",
angle = 0
)
# One label per facet, joined on the `class` column of label_different.
ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  facet_wrap(~class) +
  geom_text(
    data = label_different,
    aes(x = x, y = y, label = label, vjust = vjust, hjust = hjust, angle = angle)
  )
Playing with hjust and vjust
# Grid of every hjust × vjust × angle combination (3 × 3 × 3 = 27 rows)
# used below to visualize how text justification interacts with rotation.
# NOTE(review): expand.grid() still defaults to stringsAsFactors = TRUE,
# so `text` is a factor — harmless here since it is only used as a label.
td <- expand.grid(
hjust = c(0, 0.5, 1),
vjust = c(0, 0.5, 1),
angle = c(0, 45, 90),
text = "text"
)
# Show how hjust/vjust anchor text around its (x, y) point at three
# rotation angles (one facet per angle).
td |>
  ggplot(aes(hjust, vjust)) +
  geom_point() +
  geom_text(aes(label = text, hjust = hjust, vjust = vjust, angle = angle)) +
  facet_grid(~angle) +
  scale_x_continuous(breaks = c(0, 0.5, 1), expand = c(0, 0.2)) +
  scale_y_continuous(breaks = c(0, 0.5, 1), expand = c(0, 0.2))
Corner Labels
# Corner labels, take two. BUG FIX: the hjust/vjust vectors were swapped
# relative to the corner coordinates (e.g. the top-left corner x = -Inf,
# y = Inf got hjust = 1, vjust = 0), which justifies the labels outward
# and pushes them off the panel. Restored the pairing: hjust follows x
# (0 on the left edge, 1 on the right), vjust follows y (0 at the
# bottom, 1 at the top).
corner_labels <- tibble(
  x = c(-Inf, -Inf, Inf, Inf),
  y = c(-Inf, Inf, -Inf, Inf),
  label = c(
    "(x0,y0)", "(x0,y1)",
    "(x1,y0)", "(x1,y1)"
  ),
  vjust = c(0, 1, 0, 1),
  hjust = c(0, 0, 1, 1)
)
# Redraw the base scatterplot with the corner labels defined just above.
# (Variable name "annoation_plot" [sic] kept for consistency with earlier use.)
annoation_plot <- ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_text(
    data = corner_labels,
    aes(x = x, y = y, label = label, hjust = hjust, vjust = vjust)
  )
annoation_plot
What arguments to geom_label() control the appearance of the background box?
# geom_label() draws text in a rounded box; the box appearance is set by
# label.padding (space between text and border), label.size (border line
# width), plus the usual fill/color aesthetics.
# NOTE(review): the layer data is an empty data.frame() and every
# aesthetic is a length-1 constant inside aes(); this appears to rely on
# ggplot recycling the constants so that exactly one label is drawn —
# confirm against the rendered output. "inward" justification pulls the
# Inf/Inf position back inside the panel.
mpg |> ggplot(aes(x = displ, y = hwy)) +
geom_point() +
geom_label(data = data.frame(), label.padding = unit(0.55, "lines"), label.size = 1, color = "blue", size = 10, aes(
x = Inf, y = Inf, hjust = "inward", vjust = "inward", label = "Hello!"
))
What are the four arguments to arrow()? How do they work? Create a series of plots that demonstrate the most important options.
# arrow() options demonstrated: `angle` sets the arrowhead spread,
# `length` its size, `ends` which end(s) get a head, and `type` chooses
# open vs closed (filled) heads.
mpg |>
  ggplot(aes(x = displ, y = cty)) +
  geom_point() +
  annotate(
    "segment",
    x = 3, y = 35, xend = 5, yend = 25, color = "red",
    arrow = arrow(type = "open", angle = 45, ends = "last")
  ) +
  annotate(
    "segment",
    x = 2, y = 35, xend = 4, yend = 25, color = "blue",
    arrow = arrow(type = "closed", angle = 35, ends = "first", length = unit(1, "cm"))
  ) +
  annotate(
    "segment",
    x = 2, y = 35, xend = 4, yend = 15, color = "green",
    arrow = arrow(type = "closed", angle = 35, ends = "last")
  )
Scales
# Every aesthetic gets a scale; these three are exactly the defaults
# ggplot would add implicitly.
mpg |>
  ggplot(aes(displ, hwy)) +
  geom_point(aes(color = class)) +
  scale_x_continuous() +
  scale_y_continuous() +
  scale_color_discrete()
Axis ticks and legend keys
# Custom y-axis tick positions: every 5 units from 15 to 40.
mpg |>
  ggplot(aes(displ, hwy, color = drv)) +
  geom_point() +
  scale_y_continuous(breaks = seq(15, 40, by = 5))
Removing Ticks
# labels = NULL hides the axis tick labels; the discrete color scale's
# labels rename the legend keys.
mpg |>
  ggplot(aes(displ, hwy, color = drv)) +
  geom_point() +
  scale_x_continuous(labels = NULL) +
  scale_y_continuous(labels = NULL) +
  scale_color_discrete(labels = c("4" = "4-wheel", "f" = "front", "r" = "rear")) # for legend
# Same plot without relabelling the legend.
mpg |>
  ggplot(aes(displ, hwy, color = drv)) +
  geom_point() +
  scale_x_continuous(labels = NULL) +
  scale_y_continuous(labels = NULL)
Dollars in the scale
library(scales)
# Format the price axis as dollar amounts.
diamonds |>
  ggplot(aes(x = price, y = cut)) +
  geom_boxplot(alpha = 0.05) +
  scale_x_continuous(labels = label_dollar()) # uses scales package
More custom dollars (1k, 7k, etc.)
# Shrink prices to thousands and suffix with "K" ($1K, $7K, ...).
diamonds |>
  ggplot(aes(x = price, y = cut)) +
  geom_boxplot(alpha = 0.05) +
  scale_x_continuous(
    labels = label_dollar(scale = 1 / 1000, suffix = "K"),
    breaks = seq(1000, 19000, by = 6000)
  )
Label percentages for bar charts
# position = "fill" stacks bars to a constant height; label the y axis
# as percentages to match.
diamonds |>
  ggplot(aes(cut, fill = clarity)) +
  geom_bar(position = "fill") +
  scale_y_continuous(name = "Percentage", labels = label_percent())
Breaks for only a few data points
# With few observations it can be clearer to break the axis exactly at
# the data points: each presidency's start date becomes a tick, labelled
# as a two-digit year ("'%y").
presidential |>
mutate(id = 33 + row_number()) |>
ggplot(aes(x = start, y = id)) +
geom_point() +
geom_segment(aes(xend = end, yend = id)) +
scale_x_date(name = NULL, breaks = presidential$start, date_labels = "'%y")
Position of the legend
# Legend placement via theme(legend.position = ...), reusing one base plot.
base <- mpg |>
  ggplot(aes(displ, hwy)) +
  geom_point(aes(color = class))
base + theme(legend.position = "right") # the default
base + theme(legend.position = "left")
base +
  theme(legend.position = "top") +
  guides(color = guide_legend(nrow = 3))
base +
  theme(legend.position = "bottom") +
  guides(color = guide_legend(nrow = 2))
# Bottom legend with enlarged point keys via override.aes.
mpg |>
  ggplot(aes(displ, hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth(se = FALSE) +
  theme(legend.position = "bottom") +
  guides(color = guide_legend(nrow = 2, override.aes = list(size = 6))) # larger legend symbols
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
11.4.4 Replacing a scale
# Raw carat vs price is heavily skewed.
diamonds |>
  ggplot(aes(carat, price)) +
  geom_bin2d()
# better to log transform
diamonds |>
  ggplot(aes(log10(carat), log10(price))) +
  geom_bin2d()
Better still: adjust the scales to log-transform, keeping the original units on the axes
# Log scales transform the data like log10() but keep the original units
# on the axis labels.
diamonds |>
  ggplot(aes(carat, price)) +
  geom_bin2d() +
  scale_x_log10() +
  scale_y_log10()
change color scale
# Swap the default color scale for a ColorBrewer qualitative palette.
mpg |>
  ggplot(aes(displ, hwy)) +
  geom_point(aes(color = drv)) +
  scale_color_brewer(palette = "Set1")
Custom color scale (blue = Democratic, red = Republican)
# Map each party to its conventional color with an explicit manual scale.
presidential |>
  mutate(id = 33 + row_number()) |>
  ggplot(aes(start, id, color = party)) +
  geom_point() +
  geom_segment(aes(xend = end, yend = id)) +
  scale_color_manual(values = c(Republican = "#E81B23", Democratic = "#00AEF3"))
dealing with color blindness
# Simulated bivariate-normal data for the color-scale comparisons below.
# Fix: seed the RNG so the data — and the three plots that use it — are
# reproducible across knits.
set.seed(1)
df <- tibble(
  x = rnorm(10000),
  y = rnorm(10000)
)
# Baseline: ggplot's default continuous fill scale.
df |>
  ggplot(aes(x, y)) +
  geom_hex() +
  coord_fixed() +
  labs(title = "Default, continuous", x = NULL, y = NULL)
# Viridis continuous palette: perceptually uniform and colorblind-friendly.
df |>
  ggplot(aes(x, y)) +
  geom_hex() +
  coord_fixed() +
  scale_fill_viridis_c() +
  labs(title = "Viridis, continuous", x = NULL, y = NULL)
# Binned variant of the viridis palette.
df |>
  ggplot(aes(x, y)) +
  geom_hex() +
  coord_fixed() +
  scale_fill_viridis_b() +
  labs(title = "Viridis, binned", x = NULL, y = NULL)
11.4.5 Zooming
# Zooming by setting scale limits: observations outside the limits are
# DROPPED before stats run, so the smooth is refit to just the subset
# (hence the "Removed 202 rows" warnings below). Compare with the
# coord_cartesian() version further down, which zooms the view without
# discarding data.
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point(aes(color = drv)) +
geom_smooth() +
scale_x_continuous(limits = c(5, 6)) +
scale_y_continuous(limits = c(10, 25))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 202 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 202 rows containing missing values or values outside the scale range
## (`geom_point()`).
Using coord_cartesian to set limits
# coord_cartesian() zooms the viewport without removing observations, so
# the smooth is still fit to the full data.
mpg |>
  ggplot(aes(displ, hwy)) +
  geom_point(aes(color = drv)) +
  geom_smooth() +
  coord_cartesian(xlim = c(5, 6), ylim = c(10, 25))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Random packages (janitor, skimr)
library(ryouwithme) # sydneybeaches data
# Fix: skim() comes from skimr, and tabyl()/clean_names() (used below)
# come from janitor — load them explicitly instead of relying on them
# already being attached in the session.
library(skimr)
library(janitor)
# exploring
skim(mpg)
| Name | mpg |
| Number of rows | 234 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| character | 6 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| manufacturer | 0 | 1 | 4 | 10 | 0 | 15 | 0 |
| model | 0 | 1 | 2 | 22 | 0 | 38 | 0 |
| trans | 0 | 1 | 8 | 10 | 0 | 10 | 0 |
| drv | 0 | 1 | 1 | 1 | 0 | 3 | 0 |
| fl | 0 | 1 | 1 | 1 | 0 | 5 | 0 |
| class | 0 | 1 | 3 | 10 | 0 | 7 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| displ | 0 | 1 | 3.47 | 1.29 | 1.6 | 2.4 | 3.3 | 4.6 | 7 | ▇▆▆▃▁ |
| year | 0 | 1 | 2003.50 | 4.51 | 1999.0 | 1999.0 | 2003.5 | 2008.0 | 2008 | ▇▁▁▁▇ |
| cyl | 0 | 1 | 5.89 | 1.61 | 4.0 | 4.0 | 6.0 | 8.0 | 8 | ▇▁▇▁▇ |
| cty | 0 | 1 | 16.86 | 4.26 | 9.0 | 14.0 | 17.0 | 19.0 | 35 | ▆▇▃▁▁ |
| hwy | 0 | 1 | 23.44 | 5.95 | 12.0 | 18.0 | 24.0 | 27.0 | 44 | ▅▅▇▁▁ |
# Cross-tabulate model year by manufacturer.
tabyl(mpg, year, manufacturer)
## year audi chevrolet dodge ford honda hyundai jeep land rover lincoln mercury
## 1999 9 7 16 15 5 6 2 2 2 2
## 2008 9 12 21 10 4 8 6 2 1 2
## nissan pontiac subaru toyota volkswagen
## 6 3 6 20 16
## 7 2 8 14 11
# cleaning
# clean_names(): all lowercase with underscores, in one pipeline step.
beaches <- sydneybeaches |>
  clean_names()
glimpse(beaches)
## Rows: 3,690
## Columns: 8
## $ beach_id <dbl> 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, …
## $ region <chr> "Sydney City Ocean Beaches", "Sydney City Ocean …
## $ council <chr> "Randwick Council", "Randwick Council", "Randwic…
## $ site <chr> "Clovelly Beach", "Clovelly Beach", "Clovelly Be…
## $ longitude <dbl> 151.2675, 151.2675, 151.2675, 151.2675, 151.2675…
## $ latitude <dbl> -33.91449, -33.91449, -33.91449, -33.91449, -33.…
## $ date <chr> "02/01/2013", "06/01/2013", "12/01/2013", "18/01…
## $ enterococci_cfu_100ml <dbl> 19, 3, 2, 13, 8, 7, 11, 97, 3, 0, 6, 0, 1, 8, 3,…
# Give the long bacteria-count column a shorter working name.
beaches <- beaches |>
  rename(beachbugs = enterococci_cfu_100ml)
Most beach bugs
# Worst single-day reading per site. Fix: filter to the site of interest
# BEFORE sorting, so we only order the relevant rows instead of the whole
# 3,690-row table; the result is identical.
worst_day_coogee <- beaches |>
  filter(site == "Coogee Beach") |>
  arrange(desc(beachbugs)) |>
  head(1)
worst_day_little_bay <- beaches |>
  filter(site == "Little Bay Beach") |>
  arrange(desc(beachbugs)) |>
  head(1)
worst_day_coogee$beachbugs # 1200
## [1] 1200
worst_day_little_bay$beachbugs # 4900
## [1] 4900
Question B: Does Coogee or Bondi have more extreme bacteria levels? Which beach has the worst bacteria levels on average?
# Average bug count per site, worst first. Fix: the original mixed
# magrittr's %>% and the native |> in one pipeline; use the native pipe
# consistently (as in the rest of the document).
bugs_beach <- beaches |>
  group_by(site) |>
  summarise(avg_bug = mean(beachbugs, na.rm = TRUE)) |>
  arrange(desc(avg_bug))
bugs_beach # Malabar has worst on average
## # A tibble: 11 × 2
## site avg_bug
## <chr> <dbl>
## 1 Malabar Beach 68.1
## 2 South Maroubra Rockpool 63.9
## 3 Little Bay Beach 45.6
## 4 Coogee Beach 39.4
## 5 Tamarama Beach 35.7
## 6 Bronte Beach 31.4
## 7 Gordons Bay (East) 24.9
## 8 Maroubra Beach 20.2
## 9 Bondi Beach 18.8
## 10 South Maroubra Beach 15.7
## 11 Clovelly Beach 10.2
# Compare just the two beaches from the question. Fixes: native pipe for
# consistency, and the "Cooggee" typo in the comment.
bugs_beach |>
  filter(str_detect(site, "Coogee|Bondi")) # Coogee is worse
## # A tibble: 2 × 2
## site avg_bug
## <chr> <dbl>
## 1 Coogee Beach 39.4
## 2 Bondi Beach 18.8
Total beachbugs observations per site
# Total beachbugs observations per site, largest first.
# Fix: na.rm = T -> na.rm = TRUE (T is an ordinary, reassignable binding).
beaches |>
  group_by(site) |>
  summarise(tl_bug = sum(beachbugs, na.rm = TRUE)) |>
  arrange(desc(tl_bug))
## # A tibble: 11 × 2
## site tl_bug
## <chr> <dbl>
## 1 Malabar Beach 23227
## 2 South Maroubra Rockpool 20064
## 3 Little Bay Beach 15325
## 4 Coogee Beach 13349
## 5 Tamarama Beach 11969
## 6 Bronte Beach 10526
## 7 Gordons Bay (East) 8018
## 8 Maroubra Beach 6760
## 9 Bondi Beach 6271
## 10 South Maroubra Beach 5277
## 11 Clovelly Beach 3413
Custom Theme
# Custom ggplot2 theme built on theme_bw().
# Fix: element_line()/element_rect() `size` was deprecated in ggplot2
# 3.4.0 in favour of `linewidth` — this function was emitting the exact
# deprecation warnings shown below; use `linewidth` instead.
# (element_text() `size` is still the correct argument for text.)
theme_jen <- function() {
  # define font up front
  font <- "Helvetica"
  # this theme uses theme_bw as the base
  theme_bw() %+replace%
    theme(
      # get rid of grid lines/borders
      panel.border = element_blank(),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      # add white space top, right, bottom, left
      plot.margin = unit(c(1, 1, 1, 1), "cm"),
      # custom axis title/text/lines
      axis.title = element_text(
        family = font,
        size = 14
      ),
      axis.text = element_text(
        family = font,
        size = 12
      ),
      # margin pulls text away from axis
      axis.text.x = element_text(
        margin = margin(5, b = 10)
      ),
      # black axis lines (`linewidth`, not deprecated `size`)
      axis.line = element_line(colour = "black", linewidth = rel(1)),
      # custom plot titles, subtitles, captions
      plot.title = element_text(
        family = font,
        size = 18,
        hjust = -0.1,
        vjust = 4
      ),
      plot.subtitle = element_text(
        family = font,
        size = 14,
        hjust = 0,
        vjust = 3
      ),
      plot.caption = element_text(
        family = font,
        size = 10,
        hjust = 1,
        vjust = 2
      ),
      # custom legend
      legend.title = element_text(
        family = font,
        size = 10,
        hjust = 0
      ),
      legend.text = element_text(
        family = font,
        size = 8,
        hjust = 0
      ),
      # no background on legend
      legend.key = element_blank(),
      # facet strips: white background with a black border
      strip.background = element_rect(
        fill = "white",
        colour = "black", linewidth = rel(2)
      ),
      complete = TRUE
    )
}
GGEasy
# Download the 2021-01-05 #TidyTuesday release (transit_cost.csv, per the
# log below); presumably tidytuesdayR::tt_load — requires network access.
tt <- tt_load("2021-01-05")
## --- Compiling #TidyTuesday Information for 2021-01-05 ----
## --- There is 1 file available ---
## --- Starting Download ---
##
## Downloading file 1 of 1: `transit_cost.csv`
## --- Download complete ---
# Pull the single downloaded table out of the tt bundle.
cost <- tt$transit_cost
# Helper in the ggeasy "easy_*" naming style: removes the default axis
# expansion so the y axis starts exactly at the panel edge.
easy_expand_y_axis <- function() {
scale_y_continuous(expand = c(0, 0))
}
# Bar chart of the five countries with the highest mean cost per km.
# NOTE(review): ggdark::dark_theme_dark() is added after theme_jen() and
# appears to supersede most of its settings — confirm this layering is
# intentional.
cost %>%
  group_by(country) %>%
  summarise(meancost = mean(cost_km_millions)) %>%
  arrange(desc(meancost)) %>%
  head(5) %>%
  ggplot(aes(x = reorder(country, meancost), y = meancost, fill = country)) +
  geom_col() +
  labs(
    x = "Country",
    y = "Average cost per km (million)",
    title = "Countries with the most expensive transit projects",
    caption = "why is the US so $$$$?"
  ) +
  theme_jen() +
  easy_remove_legend() +
  easy_expand_y_axis() +
  ggdark::dark_theme_dark()
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().